From 28a009b99e1383f3e00237b3d69bf42137510431 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Sat, 22 Jun 2024 00:30:58 -0400 Subject: [PATCH 01/33] +mdb atlas --- .github/workflows/contrib-tests.yml | 11 + autogen/agentchat/contrib/vectordb/base.py | 6 +- autogen/agentchat/contrib/vectordb/mongodb.py | 292 ++++++++++++++++++ notebook/agentchat_mongodb_RetrieveChat.ipynb | 269 ++++++++++++++++ setup.py | 1 + .../contrib/vectordb/test_mongodb.py | 135 ++++++++ 6 files changed, 713 insertions(+), 1 deletion(-) create mode 100644 autogen/agentchat/contrib/vectordb/mongodb.py create mode 100644 notebook/agentchat_mongodb_RetrieveChat.ipynb create mode 100644 test/agentchat/contrib/vectordb/test_mongodb.py diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 7d8a932b0254..6c02275c6027 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -85,6 +85,14 @@ jobs: --health-retries 5 ports: - 5432:5432 + mongodb: + image: mongodb/mongodb-atlas-local:latest + restart: unless-stopped + ports: + - "27017:27017" + environment: + MONGODB_INITDB_ROOT_USERNAME: mongodb_user + MONGODB_INITDB_ROOT_PASSWORD: mongodb_password steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -102,6 +110,9 @@ jobs: - name: Install pgvector when on linux run: | pip install -e .[retrievechat-pgvector] + - name: Install mongodb when on linux + run: | + pip install -e .[retrievechat-mongodb] - name: Install unstructured when python-version is 3.9 and on linux if: matrix.python-version == '3.9' run: | diff --git a/autogen/agentchat/contrib/vectordb/base.py b/autogen/agentchat/contrib/vectordb/base.py index 29a080086193..5ce666a9d9e3 100644 --- a/autogen/agentchat/contrib/vectordb/base.py +++ b/autogen/agentchat/contrib/vectordb/base.py @@ -185,7 +185,7 @@ class VectorDBFactory: Factory class for creating vector databases. 
""" - PREDEFINED_VECTOR_DB = ["chroma", "pgvector"] + PREDEFINED_VECTOR_DB = ["chroma", "pgvector", "mongodb"] @staticmethod def create_vector_db(db_type: str, **kwargs) -> VectorDB: @@ -207,6 +207,10 @@ def create_vector_db(db_type: str, **kwargs) -> VectorDB: from .pgvectordb import PGVectorDB return PGVectorDB(**kwargs) + if db_type.lower() in ["mdb", "mongodb", "atlas"]: + from .mongodb import MongoDBAtlasVectorDB + + return MongoDBAtlasVectorDB(**kwargs) else: raise ValueError( f"Unsupported vector database type: {db_type}. Valid types are {VectorDBFactory.PREDEFINED_VECTOR_DB}." diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py new file mode 100644 index 000000000000..0b537fd30f88 --- /dev/null +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -0,0 +1,292 @@ +from typing import Callable, List, Literal + +import numpy as np +from pymongo import MongoClient, errors +from pymongo.operations import SearchIndexModel +from sentence_transformers import SentenceTransformer + +from .base import Document, ItemID, QueryResults, VectorDB +from .utils import get_logger + +logger = get_logger(__name__) + + +class MongoDBAtlasVectorDB(VectorDB): + """ + A Collection object for MongoDB. + """ + + def __init__( + self, + connection_string: str = "", + database_name: str = "vector_db", + embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, + ): + """ + Initialize the vector database. + + Args: + connection_string: str | The MongoDB connection string to connect to. Default is ''. + database_name: str | The name of the database. Default is 'vector_db'. + embedding_function: The embedding function used to generate the vector representation. 
+ """ + if embedding_function: + self.embedding_function = embedding_function + try: + self.client = MongoClient(connection_string) + self.client.admin.command("ping") + except errors.ServerSelectionTimeoutError as err: + raise ConnectionError("Could not connect to MongoDB server") from err + + self.db = self.client[database_name] + self.active_collection = None + # This will get the model dimension size by computing the embeddings dimensions + sentences = [ + "The weather is lovely today in paradise.", + ] + embeddings = self.embedding_function(sentences) + self.dimensions = len(embeddings[0]) + + def list_collections(self): + """ + List the collections in the vector database. + + Returns: + List[str] | The list of collections. + """ + try: + return self.db.list_collection_names() + except Exception as err: + raise err + + def create_collection( + self, + collection_name: str, + overwrite: bool = False, + get_or_create: bool = True, + index_name: str = "default_index", + similarity: Literal["euclidean", "cosine", "dotProduct"] = "cosine", + ): + """ + Create a collection in the vector database and create a vector search index in the collection. + + Args: + collection_name: str | The name of the collection. + index_name: str | The name of the index. + similarity: str | The similarity metric for the vector search index. + overwrite: bool | Whether to overwrite the collection if it exists. Default is False. + get_or_create: bool | Whether to get the collection if it exists. 
Default is True + """ + # if overwrite is False and get_or_create is False, raise a ValueError + if not overwrite and not get_or_create: + raise ValueError("If overwrite is False, get_or_create must be True.") + # If overwrite is True and the collection already exists, drop the existing collection + collection_names = self.db.list_collection_names() + if overwrite and collection_name in collection_names: + self.db.drop_collection(collection_name) + # If get_or_create is True and the collection already exists, return the existing collection + if get_or_create and collection_name in collection_names: + return self.db[collection_name] + # If get_or_create is False and the collection already exists, raise a ValueError + if not get_or_create and collection_name in collection_names: + raise ValueError(f"Collection {collection_name} already exists.") + + # Create a new collection + collection = self.db.create_collection(collection_name) + # Create a vector search index in the collection + search_index_model = SearchIndexModel( + definition={ + "fields": [ + {"type": "vector", "numDimensions": self.dimensions, "path": "embedding", "similarity": similarity}, + ] + }, + name=index_name, + type="vectorSearch", + ) + # Create the search index + try: + collection.create_search_index(model=search_index_model) + return collection + except Exception as e: + logger.error(f"Error creating search index: {e}") + raise e + + def get_collection(self, collection_name: str = None): + """ + Get the collection from the vector database. + + Args: + collection_name: str | The name of the collection. Default is None. If None, return the + current active collection. + + Returns: + Collection | The collection object. + """ + if collection_name is None: + if self.active_collection is None: + raise ValueError("No collection is specified.") + else: + logger.debug( + f"No collection is specified. Using current active collection {self.active_collection.name}." 
+ ) + else: + if collection_name not in self.list_collections(): + raise ValueError(f"Collection {collection_name} does not exist.") + if self.active_collection is None: + self.active_collection = self.db[collection_name] + return self.active_collection + + def delete_collection(self, collection_name: str): + """ + Delete the collection from the vector database. + + Args: + collection_name: str | The name of the collection. + """ + return self.db[collection_name].drop() + + def insert_docs(self, docs: List[Document], collection_name: str = None, upsert: bool = False): + """ + Insert documents into the collection of the vector database. + + Args: + docs: List[Document] | A list of documents. Each document is a TypedDict `Document`. + collection_name: str | The name of the collection. Default is None. + upsert: bool | Whether to update the document if it exists. Default is False. + """ + if not docs: + return + if docs[0].get("content") is None: + raise ValueError("The document content is required.") + if docs[0].get("id") is None: + raise ValueError("The document id is required.") + collection = self.get_collection(collection_name) + for doc in docs: + if "embedding" not in doc: + doc["embedding"] = np.array(self.embedding_function([str(doc["content"])])).tolist()[0] + if upsert: + for doc in docs: + return collection.replace_one({"id": doc["id"]}, doc, upsert=True) + else: + return collection.insert_many(docs) + + def update_docs(self, docs: List[Document], collection_name: str = None): + """ + Update documents in the collection of the vector database. + + Args: + docs: List[Document] | A list of documents. + collection_name: str | The name of the collection. Default is None. + """ + return self.insert_docs(docs, collection_name, upsert=True) + + def delete_docs(self, ids: List[ItemID], collection_name: str = None): + """ + Delete documents from the collection of the vector database. + + Args: + ids: List[ItemID] | A list of document ids. 
Each id is a typed `ItemID`. + collection_name: str | The name of the collection. Default is None. + """ + collection = self.get_collection(collection_name) + return collection.delete_many({"id": {"$in": ids}}) + + def get_docs_by_ids(self, ids: List[ItemID] = None, collection_name: str = None): + """ + Retrieve documents from the collection of the vector database based on the ids. + + Args: + ids: List[ItemID] | A list of document ids. If None, will return all the documents. Default is None. + collection_name: str | The name of the collection. Default is None. + """ + results = [] + if ids is None: + collection = self.get_collection(collection_name) + results = list(collection.find({}, {"embedding": 0})) + else: + for id in ids: + id = str(id) + collection = self.get_collection(collection_name) + results = list(collection.find({"id": {"$in": ids}}, {"embedding": 0})) + return results + + def retrieve_docs( + self, + queries: List[str], + collection_name: str = None, + n_results: int = 10, + distance_threshold: float = -1, + index_name: str = "default", + **kwargs, + ) -> QueryResults: + """ + Retrieve documents from the collection of the vector database based on the queries. + + Args: + queries: List[str] | A list of queries. Each query is a string. + collection_name: str | The name of the collection. Default is None. + n_results: int | The number of relevant documents to return. Default is 10. + distance_threshold: float | The threshold for the distance score, only distance smaller than it will be + returned. Don't filter with it if < 0. Default is -1. + kwargs: Dict | Additional keyword arguments. + + Returns: + QueryResults | The query results. Each query result is a list of list of tuples containing the document and + the distance. 
+ """ + results = [] + for query_text in queries: + query_vector = np.array(self.embedding_function([query_text])).tolist()[0] + # Find documents with similar vectors using the specified index + search_collection = self.get_collection(collection_name) + pipeline = [ + { + "$vectorSearch": { + "index": index_name, + "limit": n_results, + "numCandidates": n_results, + "queryVector": query_vector, + "path": "embedding", + } + }, + {"$project": {"score": {"$meta": "vectorSearchScore"}}}, + ] + if distance_threshold >= 0.00: + similarity_threshold = 1 - distance_threshold + pipeline.append({"$match": {"score": {"gte": similarity_threshold}}}) + + # do a lookup on the same collection + pipeline.append( + { + "$lookup": { + "from": collection_name, + "localField": "_id", + "foreignField": "_id", + "as": "full_document_array", + } + } + ) + pipeline.append( + { + "$addFields": { + "full_document": { + "$arrayElemAt": [ + { + "$map": { + "input": "$full_document_array", + "as": "doc", + "in": {"id": "$$doc.id", "content": "$$doc.content"}, + } + }, + 0, + ] + } + } + } + ) + pipeline.append({"$project": {"full_document_array": 0, "embedding": 0}}) + tmp_results = [] + for doc in search_collection.aggregate(pipeline): + tmp_results.append((doc["full_document"], 1 - doc["score"])) + results.append(tmp_results) + return results diff --git a/notebook/agentchat_mongodb_RetrieveChat.ipynb b/notebook/agentchat_mongodb_RetrieveChat.ipynb new file mode 100644 index 000000000000..2f49288bf52e --- /dev/null +++ b/notebook/agentchat_mongodb_RetrieveChat.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using RetrieveChat Powered by MongoDB Atlas for Retrieve Augmented Code Generation and Question Answering\n", + "\n", + "AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. 
This framework allows tool use and human participation through multi-agent conversation.\n", + "Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n", + "\n", + "RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentation that is not present in the LLM's training dataset. RetrieveChat uses the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n", + "\n", + "## Table of Contents\n", + "We'll demonstrate one example of using RetrieveChat for code generation and question answering:\n", + "\n", + "- [Example 1: Generate code based off docstrings w/o human feedback](#example-1)\n", + "\n", + "````{=mdx}\n", + ":::info Requirements\n", + "Some extra dependencies are needed for this notebook, which can be installed via pip:\n", + "\n", + "```bash\n", + "pip install pyautogen[retrievechat-mongodb] flaml[automl]\n", + "```\n", + "\n", + "For more information, please refer to the [installation guide](/docs/installation/).\n", + ":::\n", + "````\n", + "\n", + "Ensure you have a MongoDB Atlas instance.\n", + "\n", + "If not, a test version can quickly be deployed using Docker.\n", + "\n", + "`docker-compose.yml`\n", + "\n", + "```yml\n", + "version: '3.9'\n", + "\n", + "services:\n", + " mongodb:\n", + " image: mongodb/mongodb-atlas-local:latest\n", + " restart: unless-stopped\n", + " ports:\n", + " - 
\"27017:27017\"\n", + " environment:\n", + " MONGODB_INITDB_ROOT_USERNAME: mongodb_user\n", + " MONGODB_INITDB_ROOT_PASSWORD: mongodb_password\n", + "```\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint\n", + "\n", + "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "models to use: ['gpt-35-turbo']\n" + ] + } + ], + "source": [ + "import json\n", + "import os\n", + "\n", + "import autogen\n", + "from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n", + "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", + "\n", + "# Accepted file formats for that can be stored in\n", + "# a vector database instance\n", + "from autogen.retrieve_utils import TEXT_FORMATS\n", + "\n", + "config_list = [\n", + " {\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"\", \"api_type\": \"openai\"},\n", + " {\n", + " \"model\": \"gpt-35-turbo\",\n", + " \"base_url\": \"\",\n", + " \"api_type\": \"azure\",\n", + " \"api_version\": \"2023-07-01-preview\",\n", + " \"api_key\": \"\",\n", + " },\n", + "]\n", + "assert len(config_list) > 0\n", + "print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "````{=mdx}\n", + ":::tip\n", + "Learn more about configuring LLMs for agents [here](/docs/topics/llm_configuration).\n", + ":::\n", + "````\n", + "\n", + "## Construct agents for RetrieveChat\n", + "\n", + "We start by initializing the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`. 
The system message needs to be set to \"You are a helpful assistant.\" for RetrieveAssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accepted file formats for `docs_path`:\n", + "['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n" + ] + } + ], + "source": [ + "print(\"Accepted file formats for `docs_path`:\")\n", + "print(TEXT_FORMATS)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n", + "assistant = RetrieveAssistantAgent(\n", + " name=\"assistant\",\n", + " system_message=\"You are a helpful assistant.\",\n", + " llm_config={\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 42,\n", + " \"config_list\": config_list,\n", + " },\n", + ")\n", + "\n", + "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", + "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n", + "# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n", + "# it is set to None, which works only if the collection is already created.\n", + "# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n", + "# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n", + "# `custom_text_types` is a list of file types to be processed. 
Default is `autogen.retrieve_utils.TEXT_FORMATS`.\n", + "# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n", + "# In this example, we set it to [\"non-existent-type\"] to only process markdown files. Since no \"non-existent-type\" files are included in the `website/docs`,\n", + "# no files there will be processed. However, the explicitly included urls will still be processed.\n", + "ragproxyagent = RetrieveUserProxyAgent(\n", + " name=\"ragproxyagent\",\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=3,\n", + " retrieve_config={\n", + " \"task\": \"code\",\n", + " \"docs_path\": [\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n", + " os.path.join(os.path.abspath(\"\"), \"..\", \"website\", \"docs\"),\n", + " ],\n", + " \"custom_text_types\": [\"non-existent-type\"],\n", + " \"chunk_token_size\": 2000,\n", + " \"model\": config_list[0][\"model\"],\n", + " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", + " \"collection_name\": \"flaml_collection_two\",\n", + " \"index_name\": \"flaml_index_two\",\n", + " \"db_config\": {\n", + " \"connection_string\": \"\", # MongoDB Atlas connection string\n", + " \"database_name\": \"test_db\", # MongoDB Atlas database\n", + " },\n", + " \"get_or_create\": False, # set to False if you don't want to reuse an existing collection\n", + " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", + " },\n", + " code_execution_config=False, # set to False if you don't want to execute the code\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 1\n", + "\n", + "[Back to top](#table-of-contents)\n", + "\n", + "Use RetrieveChat to help generate sample code and 
automatically run the code and fix errors if there are any.\n", + "\n", + "Problem: Which API should I use if I want to use FLAML for a classification task and I want to train the model in 30 seconds. Use spark to parallelize the training. Force cancel jobs if time limit is reached." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VectorDB returns doc_ids: [[]]\n", + "\u001b[32mNo more context, will terminate.\u001b[0m\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "# reset the assistant. Always reset the assistant before starting a new conversation.\n", + "assistant.reset()\n", + "\n", + "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", + "# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.\n", + "# The conversation continues until the termination condition is met. In RetrieveChat, when there is no human in the loop, the termination condition is that no code block is detected.\n", + "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", + "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\"\n", + "chat_result = ragproxyagent.initiate_chat(\n", + " assistant, message=ragproxyagent.message_generator, problem=code_problem, search_string=\"spark\"\n", + ") # search_string is used as an extra filter for the embeddings search, in this case, we only want to search documents that contain \"spark\"." 
+ ] + } + ], + "metadata": { + "front_matter": { + "description": "Explore the use of AutoGen's RetrieveChat for tasks like code generation from docstrings, answering complex questions with human feedback, and exploiting features like Update Context, custom prompts, and few-shot learning.", + "tags": [ + "RAG" + ] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "skip_test": "Requires interactive usage" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.py b/setup.py index 738e09d9061c..900df9ce2201 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,7 @@ "mathchat": ["sympy", "pydantic==1.10.9", "wolframalpha"], "retrievechat": retrieve_chat, "retrievechat-pgvector": retrieve_chat_pgvector, + "retrievechat-mongodb": [*retrieve_chat, "pymongo>=4.0.0"], "retrievechat-qdrant": [ *retrieve_chat, "qdrant_client[fastembed]<1.9.2", diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py new file mode 100644 index 000000000000..a9f899e30131 --- /dev/null +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -0,0 +1,135 @@ +import os +import sys +import time +import urllib.parse + +import pytest +from conftest import reason + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +try: + import sentence_transformers + + from autogen.agentchat.contrib.vectordb.mongodb import MongoDBAtlasVectorDB +except ImportError: + skip = True +else: + skip = False + +reason = "do not run on MacOS or windows OR dependency is not installed OR " + reason + + +@pytest.mark.skipif( + sys.platform in ["darwin", "win32"] or skip, + reason=reason, +) +def test_mongodb(): + # test db config + 
db_config = { + "connection_string": "mongodb://mongodb_user:mongodb_password@localhost:27017/database_name", + } + + # test create collection with connection_string authentication + db = MongoDBAtlasVectorDB( + connection_string=db_config["connection_string"], + ) + collection_name = "test_collection" + """ + def create_collection(collection_name: str, + overwrite: bool = False, + get_or_create: bool = True) -> Any + Create a collection in the vector database. + - Case 1. if the collection does not exist, create the collection. + - Case 2. the collection exists, if overwrite is True, it will overwrite the collection. + - Case 3. the collection exists and overwrite is False, if get_or_create is True, it will get the collection, otherwise it raise a ValueError. + """ + # test_create_collection: case 1 + if collection_name not in db.list_collections(): + collection = db.create_collection( + collection_name=collection_name, + index_name="my_index", + similarity="cosine", + overwrite=False, + get_or_create=True, + ) + assert collection.name == collection_name + # test_create_collection: case 2 + # test overwrite=True + collection = db.create_collection( + collection_name=collection_name, + index_name="my_index_1", + similarity="cosine", + overwrite=True, + get_or_create=True, + ) + assert collection.name == collection_name + + # test_create_collection: case 3 + # test overwrite=False + # test get_or_create=False + with pytest.raises(ValueError): + collection = db.create_collection( + collection_name, index_name="my_index_1", similarity="cosine", overwrite=False, get_or_create=False + ) + # test get_or_create=True + collection = db.create_collection( + collection_name, index_name="my_index_1", similarity="cosine", overwrite=False, get_or_create=True + ) + assert collection.name == collection_name + + # test_get_collection + collection = db.get_collection(collection_name) + assert collection.name == collection_name + + # test_insert_docs + docs = [{"content": "doc1", 
"id": "1"}, {"content": "doc2", "id": "2"}, {"content": "doc3", "id": "3"}] + db.insert_docs(docs, collection_name, upsert=False) + res = list(db.get_collection(collection_name).find({"id": {"$in": ["1", "2"]}})) + final_results = [result.get("content") for result in res] + assert final_results == ["doc1", "doc2"] + + # test_update_docs + docs = [{"content": "doc11", "id": "1"}, {"content": "doc2", "id": "2"}, {"content": "doc3", "id": "3"}] + db.update_docs(docs, collection_name) + res = list(db.get_collection(collection_name).find({"id": {"$in": ["1", "2"]}})) + final_results = [result.get("content") for result in res] + assert final_results == ["doc11", "doc2"] + + # test_delete_docs + ids = ["1"] + db.delete_docs(ids, collection_name) + res = list(db.get_collection(collection_name).find({"id": {"$in": ids}})) + final_results = [result.get("content") for result in res] + assert final_results == [] + + # sleep for a few seconds -- make sure vector search index is ready + time.sleep(30) + # test_retrieve_docs + """ + [[({'content': 'doc2', 'id': '2'}, 0.0), + ({'content': 'doc3', 'id': '3'}, 0.08)], + [({'content': 'doc3', 'id': '3'}, 0.0), + ({'content': 'doc2', 'id': '2'}, 0.08)]] + """ + queries = ["doc2", "doc3"] + res = db.retrieve_docs(queries=queries, collection_name=collection_name, index_name="my_index_1") + assert [[r[0]["id"] for r in rr] for rr in res] == [["2", "3"], ["3", "2"]] + res = db.retrieve_docs( + queries=queries, collection_name=collection_name, distance_threshold=0.05, index_name="my_index_1" + ) + assert [[r[0]["id"] for r in rr] for rr in res] == [["2"], ["3"]] + # test_get_docs_by_ids + res = db.get_docs_by_ids(["1", "2"], collection_name) + assert [r["id"] for r in res] == ["2"] # "1" has been deleted + res = db.get_docs_by_ids(collection_name=collection_name) + assert set([r["id"] for r in res]) == set(["2", "3"]) # All Docs returned + + # test_delete_collection + db.delete_collection(collection_name) + # check if the collection is 
deleted + pytest.raises(ValueError, db.get_collection, collection_name) + + +if __name__ == "__main__": + test_mongodb() From 383168ee91fb2787feac8a1373392c2aa17380b0 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Sat, 22 Jun 2024 08:14:38 -0400 Subject: [PATCH 02/33] Update test/agentchat/contrib/vectordb/test_mongodb.py Co-authored-by: HRUSHIKESH DOKALA <96101829+Hk669@users.noreply.github.com> --- test/agentchat/contrib/vectordb/test_mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index a9f899e30131..d0ef4334b293 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -63,7 +63,7 @@ def create_collection(collection_name: str, overwrite=True, get_or_create=True, ) - assert collection.name == collection_name + assert collection.collection_name == collection_name # test_create_collection: case 3 # test overwrite=False From f531568c21dc19a7b89dea0c17952f0051766664 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Sat, 22 Jun 2024 09:30:02 -0400 Subject: [PATCH 03/33] update test_mongodb.py; we dont need to do the assert .collection_name vs .name --- test/agentchat/contrib/vectordb/test_mongodb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index d0ef4334b293..04e847414b4c 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -54,6 +54,7 @@ def create_collection(collection_name: str, get_or_create=True, ) assert collection.name == collection_name + # test_create_collection: case 2 # test overwrite=True collection = db.create_collection( @@ -63,7 +64,7 @@ def create_collection(collection_name: str, overwrite=True, get_or_create=True, ) - assert collection.collection_name == collection_name + 
assert collection.name == collection_name # test_create_collection: case 3 # test overwrite=False From a1c385c99afb3d07858d010dc05e1ce74579013f Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sat, 22 Jun 2024 22:17:37 +0800 Subject: [PATCH 04/33] Try fix mongodb service --- .github/workflows/contrib-tests.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 6c02275c6027..e7f3f4a063dc 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -85,14 +85,13 @@ jobs: --health-retries 5 ports: - 5432:5432 - mongodb: - image: mongodb/mongodb-atlas-local:latest - restart: unless-stopped - ports: - - "27017:27017" - environment: - MONGODB_INITDB_ROOT_USERNAME: mongodb_user - MONGODB_INITDB_ROOT_PASSWORD: mongodb_password + mongodb: + image: mongodb/mongodb-atlas-local:latest + ports: + - "27017:27017" + environment: + MONGODB_INITDB_ROOT_USERNAME: mongodb + MONGODB_INITDB_ROOT_PASSWORD: mongodb steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From bb1a18312377ae51ad93376495a66b2349943464 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sat, 22 Jun 2024 22:19:46 +0800 Subject: [PATCH 05/33] Try fix mongodb service --- .github/workflows/contrib-tests.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index e7f3f4a063dc..be0b3df2c075 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -85,13 +85,13 @@ jobs: --health-retries 5 ports: - 5432:5432 - mongodb: - image: mongodb/mongodb-atlas-local:latest - ports: - - "27017:27017" - environment: - MONGODB_INITDB_ROOT_USERNAME: mongodb - MONGODB_INITDB_ROOT_PASSWORD: mongodb + mongodb: + image: mongodb/mongodb-atlas-local:latest + ports: + - 27017:27017 + env: + MONGODB_INITDB_ROOT_USERNAME: mongodb + 
MONGODB_INITDB_ROOT_PASSWORD: mongodb steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From 1e30425bcd8c46c35d328d10465ffc98bdf11fd1 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sat, 22 Jun 2024 22:22:39 +0800 Subject: [PATCH 06/33] Update username and password --- .github/workflows/contrib-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index be0b3df2c075..3de0d051c50f 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -90,8 +90,8 @@ jobs: ports: - 27017:27017 env: - MONGODB_INITDB_ROOT_USERNAME: mongodb - MONGODB_INITDB_ROOT_PASSWORD: mongodb + MONGODB_INITDB_ROOT_USERNAME: mongodb_user + MONGODB_INITDB_ROOT_PASSWORD: mongodb_password steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From e0f3c593bed987d1492f87e3688ff96d4a3f2351 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sat, 22 Jun 2024 23:11:02 +0800 Subject: [PATCH 07/33] Update autogen/agentchat/contrib/vectordb/mongodb.py --- autogen/agentchat/contrib/vectordb/mongodb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 0b537fd30f88..7e39e35447a0 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -84,6 +84,7 @@ def create_collection( collection_names = self.db.list_collection_names() if overwrite and collection_name in collection_names: self.db.drop_collection(collection_name) + collection_names = self.db.list_collection_names() # update collection names # If get_or_create is True and the collection already exists, return the existing collection if get_or_create and collection_name in collection_names: return self.db[collection_name] From d6a1162e9365bf7fdea0b8723c38627f66d08e24 Mon Sep 17 00:00:00 2001 From: Fabian Valle 
Date: Sat, 22 Jun 2024 21:34:21 -0400 Subject: [PATCH 08/33] closer --- but im not super thrilled about the solution... --- autogen/agentchat/contrib/vectordb/mongodb.py | 66 +++- notebook/agentchat_mongodb_RetrieveChat.ipynb | 287 +++++++++++++++++- .../contrib/vectordb/test_mongodb.py | 18 +- 3 files changed, 334 insertions(+), 37 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 7e39e35447a0..4290a751714d 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -1,3 +1,4 @@ +import time from typing import Callable, List, Literal import numpy as np @@ -21,6 +22,8 @@ def __init__( connection_string: str = "", database_name: str = "vector_db", embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, + index_name: str = "default_index", + similarity: Literal["euclidean", "cosine", "dotProduct"] = "cosine", ): """ Initialize the vector database. @@ -46,6 +49,10 @@ def __init__( ] embeddings = self.embedding_function(sentences) self.dimensions = len(embeddings[0]) + # index lookup + self.database_name = database_name + self.index_name = index_name + self.similarity = similarity def list_collections(self): """ @@ -59,14 +66,7 @@ def list_collections(self): except Exception as err: raise err - def create_collection( - self, - collection_name: str, - overwrite: bool = False, - get_or_create: bool = True, - index_name: str = "default_index", - similarity: Literal["euclidean", "cosine", "dotProduct"] = "cosine", - ): + def create_collection(self, collection_name: str, overwrite: bool = False, get_or_create: bool = True): """ Create a collection in the vector database and create a vector search index in the collection. 
@@ -83,6 +83,16 @@ def create_collection( # If overwrite is True and the collection already exists, drop the existing collection collection_names = self.db.list_collection_names() if overwrite and collection_name in collection_names: + collection = self.db[collection_name] + collection.drop_search_index(self.index_name) + pleaseWait = True + print(f"Waiting for index: {self.index_name} on {collection_name} to be dropped.") + while pleaseWait: + current_index_status = collection.list_search_indexes() + pleaseWait = False + for index in current_index_status: + if index["name"] == self.index_name: + pleaseWait = True self.db.drop_collection(collection_name) collection_names = self.db.list_collection_names() # update collection names # If get_or_create is True and the collection already exists, return the existing collection @@ -98,15 +108,30 @@ def create_collection( search_index_model = SearchIndexModel( definition={ "fields": [ - {"type": "vector", "numDimensions": self.dimensions, "path": "embedding", "similarity": similarity}, + { + "type": "vector", + "numDimensions": self.dimensions, + "path": "embedding", + "similarity": self.similarity, + }, ] }, - name=index_name, + name=self.index_name, type="vectorSearch", ) # Create the search index try: collection.create_search_index(model=search_index_model) + # wait until the search_index is 'ready' before returning collection + pleaseWait = True + print("Creating index on " + collection_name + ". 
Let's wait for the index to be READY.") + while pleaseWait: + current_index_status = collection.list_search_indexes() + for index in current_index_status: + if index["name"] == self.index_name: + if str(index["status"]).lower() == "ready": + pleaseWait = False + print(f"{self.index_name} on {collection_name} is READY.") return collection except Exception as e: logger.error(f"Error creating search index: {e}") @@ -217,7 +242,6 @@ def retrieve_docs( collection_name: str = None, n_results: int = 10, distance_threshold: float = -1, - index_name: str = "default", **kwargs, ) -> QueryResults: """ @@ -239,11 +263,11 @@ def retrieve_docs( for query_text in queries: query_vector = np.array(self.embedding_function([query_text])).tolist()[0] # Find documents with similar vectors using the specified index - search_collection = self.get_collection(collection_name) + search_collection = self.db[collection_name] pipeline = [ { "$vectorSearch": { - "index": index_name, + "index": self.index_name, "limit": n_results, "numCandidates": n_results, "queryVector": query_vector, @@ -287,7 +311,21 @@ def retrieve_docs( ) pipeline.append({"$project": {"full_document_array": 0, "embedding": 0}}) tmp_results = [] - for doc in search_collection.aggregate(pipeline): + # lets check the status of the index + pleaseWait = True + while pleaseWait: + current_index_status = search_collection.list_search_indexes() + for index in current_index_status: + if index["name"] == self.index_name and str(index["status"]).lower() == "ready": + pleaseWait = False + print("index is ready to use.") + print(index) + # run the pipeline + time.sleep(15) # not sure why I need this :( + logger.debug(f"Now running pipeline: {pipeline}") + print(f"Now running pipeline: {pipeline}") + search_results = list(search_collection.aggregate(pipeline)) + for doc in search_results: tmp_results.append((doc["full_document"], 1 - doc["score"])) results.append(tmp_results) return results diff --git 
a/notebook/agentchat_mongodb_RetrieveChat.ipynb b/notebook/agentchat_mongodb_RetrieveChat.ipynb index 2f49288bf52e..aca2f1174921 100644 --- a/notebook/agentchat_mongodb_RetrieveChat.ipynb +++ b/notebook/agentchat_mongodb_RetrieveChat.ipynb @@ -70,7 +70,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "models to use: ['gpt-35-turbo']\n" + "models to use: ['gpt-3.5-turbo-0125', 'gpt-35-turbo']\n" ] } ], @@ -137,9 +137,24 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 06-22 21:19:19] {150} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-22 21:19:19,131 WARNING autogen.oai.client: The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" + ] + } + ], "source": [ "# 1. 
create an RetrieveAssistantAgent instance named \"assistant\"\n", "assistant = RetrieveAssistantAgent(\n", @@ -177,13 +192,12 @@ " \"chunk_token_size\": 2000,\n", " \"model\": config_list[0][\"model\"],\n", " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", - " \"collection_name\": \"flaml_collection_two\",\n", - " \"index_name\": \"flaml_index_two\",\n", + " \"collection_name\": \"flaml_collection\",\n", " \"db_config\": {\n", - " \"connection_string\": \"\", # MongoDB Atlas connection string\n", + " \"connection_string\": \"\", # MongoDB Atlas connection string\n", " \"database_name\": \"test_db\", # MongoDB Atlas database\n", " },\n", - " \"get_or_create\": False, # set to False if you don't want to reuse an existing collection\n", + " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", @@ -206,14 +220,269 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "VectorDB returns doc_ids: [[]]\n", + "Trying to create collection.\n", + "Creating index on flaml_collection. 
Let's wait for the index to be READY.\n", + "default_index on flaml_collection is READY.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-22 21:19:45,413 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "index is ready to use.\n", + "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", + "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 20, 'numCandidates': 20, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, 
-0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 
0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 
0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 
0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 
'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", + "VectorDB returns doc_ids: [['bdfbc921']]\n", + "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", + "context provided by the user.\n", + "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", + "For code generation, you must obey the following rules:\n", + "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", + "Rule 2. You must follow the formats below to write your code:\n", + "```language\n", + "# your code\n", + "```\n", + "\n", + "User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", + "\n", + "Context is: # Integrate - Spark\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", + "\n", + "## Spark ML Estimators\n", + "\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", + "\n", + "### Data\n", + "\n", + "For Spark estimators, AutoML only consumes Spark data. 
FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", + "\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "\n", + "Here is an example code snippet for Spark Data:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "\n", + "# Creating a dictionary\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000],\n", + "}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", + "\n", + "To use Spark ML models you need to format your data appropriately. 
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "\n", + "Here is an example of how to use it:\n", + "\n", + "```python\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "```\n", + "\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", + "\n", + "### Estimators\n", + "\n", + "#### Model List\n", + "\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", + "\n", + "#### Usage\n", + "\n", + "First, prepare your data in the required format as described in the previous section.\n", + "\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", + "\n", + "Here is an example code snippet using SparkML models in AutoML:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", + "\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", + "\n", + "## Parallel Spark Jobs\n", + "\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", + "\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", + "\n", + "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", + "\n", + "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. 
This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", + "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", + "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", + "\n", + "An example code snippet for using parallel Spark jobs:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", + "```\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "To perform a classification task using FLAML and Spark for parallel training, you can use the `lgbm_spark` estimator and activate Spark as the parallel backend during parallel tuning by setting the `use_spark` to `true`. You can also train for 30 seconds and force cancel jobs if the time limit is reached by setting `time_budget` and `force_cancel` in the `automl_settings`. Here's an example code snippet to perform classification:\n", + "\n", + "```python\n", + "import flaml\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Prepare your data in pandas-on-spark format\n", + "data = {\n", + " \"feature1\": [0,1,1,0,1],\n", + " \"feature2\": [1,0,1,1,0],\n", + " \"feature3\": [0,1,0,1,1],\n", + " \"class\": [0,0,1,0,1],\n", + "}\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"class\"\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "\n", + "# Use FLAML to perform classification with Spark for parallel training\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30, # Set time budget to 30 seconds\n", + " \"metric\": \"accuracy\", # Use accuracy metric for 
classification\n", + " \"task\": \"classification\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # Use lgbm_spark estimator for classification\n", + " \"n_concurrent_trials\": 2, # Set number of concurrent trials to 2\n", + " \"use_spark\": True, # Activate Spark as parallel backend\n", + " \"force_cancel\": True, # Force cancel jobs if time limit is reached\n", + "}\n", + "\n", + "automl_experiment.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", + "```\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "UPDATE CONTEXT\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", + "index is ready to use.\n", + "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 
'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", + "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 60, 'numCandidates': 60, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, 
-0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 
0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, 
-0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 
0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", + "VectorDB returns doc_ids: [['bdfbc921']]\n", + "index is ready to use.\n", + "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 
'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", + "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 100, 'numCandidates': 100, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 
0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, 
-0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 
0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 
0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", + "VectorDB returns doc_ids: [['bdfbc921']]\n", + "index is ready to use.\n", + "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 
'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", + "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 140, 'numCandidates': 140, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, 
-0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 
0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, 
-0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 
0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", + "VectorDB returns doc_ids: [['bdfbc921']]\n", + "index is ready to use.\n", + "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 
'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", + "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 180, 'numCandidates': 180, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, 
-0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 
0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, 
-0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 
0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", + "VectorDB returns doc_ids: [['bdfbc921']]\n", "\u001b[32mNo more context, will terminate.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 04e847414b4c..2cb669fa4cf4 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -48,8 
+48,6 @@ def create_collection(collection_name: str, if collection_name not in db.list_collections(): collection = db.create_collection( collection_name=collection_name, - index_name="my_index", - similarity="cosine", overwrite=False, get_or_create=True, ) @@ -59,8 +57,6 @@ def create_collection(collection_name: str, # test overwrite=True collection = db.create_collection( collection_name=collection_name, - index_name="my_index_1", - similarity="cosine", overwrite=True, get_or_create=True, ) @@ -70,13 +66,9 @@ def create_collection(collection_name: str, # test overwrite=False # test get_or_create=False with pytest.raises(ValueError): - collection = db.create_collection( - collection_name, index_name="my_index_1", similarity="cosine", overwrite=False, get_or_create=False - ) + collection = db.create_collection(collection_name, overwrite=False, get_or_create=False) # test get_or_create=True - collection = db.create_collection( - collection_name, index_name="my_index_1", similarity="cosine", overwrite=False, get_or_create=True - ) + collection = db.create_collection(collection_name, overwrite=False, get_or_create=True) assert collection.name == collection_name # test_get_collection @@ -114,11 +106,9 @@ def create_collection(collection_name: str, ({'content': 'doc2', 'id': '2'}, 0.08)]] """ queries = ["doc2", "doc3"] - res = db.retrieve_docs(queries=queries, collection_name=collection_name, index_name="my_index_1") + res = db.retrieve_docs(queries=queries, collection_name=collection_name) assert [[r[0]["id"] for r in rr] for rr in res] == [["2", "3"], ["3", "2"]] - res = db.retrieve_docs( - queries=queries, collection_name=collection_name, distance_threshold=0.05, index_name="my_index_1" - ) + res = db.retrieve_docs(queries=queries, collection_name=collection_name, distance_threshold=0.05) assert [[r[0]["id"] for r in rr] for rr in res] == [["2"], ["3"]] # test_get_docs_by_ids res = db.get_docs_by_ids(["1", "2"], collection_name) From 
de48057c7f2e66459a7ba2a10ae76702bae31899 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 24 Jun 2024 10:53:21 -0400 Subject: [PATCH 09/33] PYTHON-4506 Expanded tests and simplified vector search pipelines --- autogen/agentchat/contrib/vectordb/base.py | 2 +- autogen/agentchat/contrib/vectordb/mongodb.py | 415 ++++++++++++------ .../contrib/vectordb/test_mongodb.py | 344 +++++++++++---- 3 files changed, 544 insertions(+), 217 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/base.py b/autogen/agentchat/contrib/vectordb/base.py index 5ce666a9d9e3..41b8f143ab83 100644 --- a/autogen/agentchat/contrib/vectordb/base.py +++ b/autogen/agentchat/contrib/vectordb/base.py @@ -171,7 +171,7 @@ def get_docs_by_ids( ids: List[ItemID] | A list of document ids. If None, will return all the documents. Default is None. collection_name: str | The name of the collection. Default is None. include: List[str] | The fields to include. Default is None. - If None, will include ["metadatas", "documents"], ids will always be included. + If None, will include ["metadata", "content"], ids will always be included. # TODO - Confirm keys kwargs: dict | Additional keyword arguments. 
Returns: diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 0b537fd30f88..471e4b1d9912 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -1,7 +1,10 @@ -from typing import Callable, List, Literal +from copy import deepcopy +from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Set, Tuple, Union import numpy as np -from pymongo import MongoClient, errors +from pymongo import MongoClient, UpdateOne, errors +from pymongo.collection import Collection +from pymongo.cursor import Cursor from pymongo.operations import SearchIndexModel from sentence_transformers import SentenceTransformer @@ -10,6 +13,13 @@ logger = get_logger(__name__) +DEFAULT_INSERT_BATCH_SIZE = 100_000 + + +def with_id_rename(docs: Iterable) -> List[Dict[str, Any]]: + """Utility changes _id field from Collection into id for Document.""" + return [{**{k: v for k, v in d.items() if k != "_id"}, "id": d["_id"]} for d in docs] + class MongoDBAtlasVectorDB(VectorDB): """ @@ -21,6 +31,7 @@ def __init__( connection_string: str = "", database_name: str = "vector_db", embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, + collection_name: str = None, ): """ Initialize the vector database. @@ -30,20 +41,22 @@ def __init__( database_name: str | The name of the database. Default is 'vector_db'. embedding_function: The embedding function used to generate the vector representation. 
""" - if embedding_function: - self.embedding_function = embedding_function + self.embedding_function = embedding_function try: self.client = MongoClient(connection_string) self.client.admin.command("ping") + logger.info("Successfully created MongoClient") except errors.ServerSelectionTimeoutError as err: raise ConnectionError("Could not connect to MongoDB server") from err self.db = self.client[database_name] - self.active_collection = None + logger.info(f"Atlas Database name: {self.db.name}") + if collection_name: + self.active_collection = self.create_collection(collection_name) + else: + self.active_collection = None # This will get the model dimension size by computing the embeddings dimensions - sentences = [ - "The weather is lovely today in paradise.", - ] + sentences = ["The weather is lovely today in paradise."] embeddings = self.embedding_function(sentences) self.dimensions = len(embeddings[0]) @@ -54,19 +67,14 @@ def list_collections(self): Returns: List[str] | The list of collections. """ - try: - return self.db.list_collection_names() - except Exception as err: - raise err + return self.db.list_collection_names() def create_collection( self, collection_name: str, overwrite: bool = False, get_or_create: bool = True, - index_name: str = "default_index", - similarity: Literal["euclidean", "cosine", "dotProduct"] = "cosine", - ): + ) -> Collection: """ Create a collection in the vector database and create a vector search index in the collection. 
@@ -80,38 +88,23 @@ def create_collection( # if overwrite is False and get_or_create is False, raise a ValueError if not overwrite and not get_or_create: raise ValueError("If overwrite is False, get_or_create must be True.") - # If overwrite is True and the collection already exists, drop the existing collection + collection_names = self.db.list_collection_names() - if overwrite and collection_name in collection_names: + if collection_name not in collection_names: + # Create a new collection + return self.db.create_collection(collection_name) + + if overwrite: self.db.drop_collection(collection_name) - # If get_or_create is True and the collection already exists, return the existing collection - if get_or_create and collection_name in collection_names: + + if get_or_create: + # The collection already exists, return it. return self.db[collection_name] - # If get_or_create is False and the collection already exists, raise a ValueError - if not get_or_create and collection_name in collection_names: + else: + # get_or_create is False and the collection already exists, raise an error. raise ValueError(f"Collection {collection_name} already exists.") - # Create a new collection - collection = self.db.create_collection(collection_name) - # Create a vector search index in the collection - search_index_model = SearchIndexModel( - definition={ - "fields": [ - {"type": "vector", "numDimensions": self.dimensions, "path": "embedding", "similarity": similarity}, - ] - }, - name=index_name, - type="vectorSearch", - ) - # Create the search index - try: - collection.create_search_index(model=search_index_model) - return collection - except Exception as e: - logger.error(f"Error creating search index: {e}") - raise e - - def get_collection(self, collection_name: str = None): + def get_collection(self, collection_name: str = None) -> Collection: """ Get the collection from the vector database. 
@@ -130,13 +123,11 @@ def get_collection(self, collection_name: str = None): f"No collection is specified. Using current active collection {self.active_collection.name}." ) else: - if collection_name not in self.list_collections(): - raise ValueError(f"Collection {collection_name} does not exist.") - if self.active_collection is None: - self.active_collection = self.db[collection_name] + self.active_collection = self.db[collection_name] + return self.active_collection - def delete_collection(self, collection_name: str): + def delete_collection(self, collection_name: str) -> None: """ Delete the collection from the vector database. @@ -145,42 +136,190 @@ def delete_collection(self, collection_name: str): """ return self.db[collection_name].drop() - def insert_docs(self, docs: List[Document], collection_name: str = None, upsert: bool = False): + def create_vector_search_index( + self, + collection: Collection, + index_name: Union[str, None] = "default_index", + similarity: Literal["euclidean", "cosine", "dotProduct"] = "cosine", + ) -> None: + """Create a vector search index in the collection. + + Args: + collection: An existing Collection in the Atlas Database. + index_name: Vector Search Index name. + similarity: Algorithm used for measuring vector similarity. + kwargs: Additional keyword arguments. + + Returns: + None """ - Insert documents into the collection of the vector database. + search_index_model = SearchIndexModel( + definition={ + "fields": [ + { + "type": "vector", + "numDimensions": self.dimensions, + "path": "embedding", + "similarity": similarity, + }, + ] + }, + name=index_name, + type="vectorSearch", + ) + # Create the search index + try: + collection.create_search_index(model=search_index_model) + except Exception as e: + logger.error( + f"Error creating search index: {e}. \n" + f"Your client must be connected to an Atlas cluster. " + f"You may have to manually create a Collection and Search Index " + f"if you are on a free/shared cluster." 
+ ) + raise e + + def insert_docs( + self, + docs: List[Document], + collection_name: str = None, + upsert: bool = False, + batch_size=DEFAULT_INSERT_BATCH_SIZE, + **kwargs, + ) -> None: + """Insert Documents and Vector Embeddings into the collection of the vector database. + + For large numbers of Documents, insertion is performed in batches. Args: docs: List[Document] | A list of documents. Each document is a TypedDict `Document`. collection_name: str | The name of the collection. Default is None. upsert: bool | Whether to update the document if it exists. Default is False. + batch_size: Number of documents to be inserted in each batch """ if not docs: + logger.info("No documents to insert.") return + + if upsert: + raise ValueError( + "If attempting to upsert, please use update_docs with upsert=True. " + "upsert is a function of update in MongoDB. " + "It is most efficient if done there." + ) + + collection = self.get_collection(collection_name) + + # Sanity checking the first document if docs[0].get("content") is None: raise ValueError("The document content is required.") if docs[0].get("id") is None: raise ValueError("The document id is required.") - collection = self.get_collection(collection_name) + + input_ids = set() + result_ids = set() + id_batch = [] + text_batch = [] + metadata_batch = [] + size = 0 + i = 0 for doc in docs: - if "embedding" not in doc: - doc["embedding"] = np.array(self.embedding_function([str(doc["content"])])).tolist()[0] - if upsert: - for doc in docs: - return collection.replace_one({"id": doc["id"]}, doc, upsert=True) - else: - return collection.insert_many(docs) + id = doc["id"] + text = doc["content"] + metadata = doc.get("metadata", {}) + id_batch.append(id) + text_batch.append(text) + metadata_batch.append(metadata) + size += len(text) + len(metadata) # todo consider len(id) when str|int + if (i + 1) % batch_size == 0 or size >= 47_000_000: + result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, 
id_batch)) + input_ids.update(id_batch) + id_batch = [] + text_batch = [] + metadata_batch = [] + size = 0 + i += 1 + if text_batch: + result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, id_batch)) # type: ignore + input_ids.update(id_batch) + + if result_ids != input_ids: + logger.warning( + "Possible data corruption. " + "input_ids not in result_ids: {in_diff}.\n" + "result_ids not in input_ids: {out_diff}".format( + in_diff=input_ids.difference(result_ids), out_diff=result_ids.difference(input_ids) + ) + ) + + def _insert_batch( + self, collection: Collection, texts: List[str], metadatas: List[Mapping[str, Any]], ids: List[ItemID] + ) -> Set[ItemID]: + """Compute embeddings for and insert a batch of Documents into the Collection. - def update_docs(self, docs: List[Document], collection_name: str = None): + For performance reasons, we chose to call self.embedding_function just once, + with the hopefully small tradeoff of having recreating Document dicts. + + Args: + collection: MongoDB Collection + texts: List of the main contents of each document + metadatas: List of metadata mappings + ids: List of ids. Note that these are stored as _id in Collection. + + Returns: + List of ids inserted. """ - Update documents in the collection of the vector database. + n_texts = len(texts) + if n_texts == 0: + return [] + # Embed and create the documents + embeddings = self.embedding_function(texts).tolist() + assert ( + len(embeddings) == n_texts + ), f"The number of embeddings produced by self.embedding_function ({len(embeddings)} does not match the number of texts provided to it ({n_texts})." 
+ to_insert = [ + {"_id": i, "content": t, "metadata": m, "embedding": e} + for i, t, m, e in zip(ids, texts, metadatas, embeddings) + ] + # insert the documents in MongoDB Atlas + insert_result = collection.insert_many(to_insert) # type: ignore + return insert_result.inserted_ids + + def update_docs(self, docs: List[Document], collection_name: str = None, **kwargs: Any) -> None: + """Update documents, including their embeddings, in the Collection. + + Optionally allow upsert as kwarg. + + Uses deepcopy to avoid changing docs. Args: docs: List[Document] | A list of documents. collection_name: str | The name of the collection. Default is None. + kwargs: Any | Use upsert=True` to insert documents whose ids are not present in collection. """ - return self.insert_docs(docs, collection_name, upsert=True) - def delete_docs(self, ids: List[ItemID], collection_name: str = None): + n_docs = len(docs) + logger.info(f"Preparing to embed and update {n_docs=}") + # Compute the embeddings + embeddings = self.embedding_function([doc["content"] for doc in docs]).tolist() + # Prepare the updates + all_updates = [] + for i in range(n_docs): + doc = deepcopy(docs[i]) + doc["embedding"] = embeddings + doc["_id"] = doc.pop("id") + + all_updates.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=kwargs.get("upsert", False))) + # Perform update in bulk + collection = self.get_collection(collection_name) + result = collection.bulk_write(all_updates) + + # Log a result summary + logger.info(f"Matched: {result.matched_count}") + logger.info(f"Modified: {result.modified_count}") + logger.info(f"Upserted: {result.upserted_count}") + + def delete_docs(self, ids: List[ItemID], collection_name: str = None, **kwargs): """ Delete documents from the collection of the vector database. @@ -189,26 +328,34 @@ def delete_docs(self, ids: List[ItemID], collection_name: str = None): collection_name: str | The name of the collection. Default is None. 
""" collection = self.get_collection(collection_name) - return collection.delete_many({"id": {"$in": ids}}) + return collection.delete_many({"_id": {"$in": ids}}) - def get_docs_by_ids(self, ids: List[ItemID] = None, collection_name: str = None): + def get_docs_by_ids( + self, ids: List[ItemID] = None, collection_name: str = None, include: List[str] = None, **kwargs + ) -> List[Document]: """ Retrieve documents from the collection of the vector database based on the ids. Args: ids: List[ItemID] | A list of document ids. If None, will return all the documents. Default is None. collection_name: str | The name of the collection. Default is None. + include: List[str] | The fields to include. + If None, will include ["metadata", "content"], ids will always be included. + Basically, use include to choose whether to include embedding and metadata + kwargs: dict | Additional keyword arguments. + + Returns: + List[Document] | The results. """ - results = [] - if ids is None: - collection = self.get_collection(collection_name) - results = list(collection.find({}, {"embedding": 0})) + if include is None: + include_fields = {"_id": 1, "content": 1, "metadata": 1} else: - for id in ids: - id = str(id) - collection = self.get_collection(collection_name) - results = list(collection.find({"id": {"$in": ids}}, {"embedding": 0})) - return results + include_fields = {k: 1 for k in set(include).union({"_id"})} + + collection = self.get_collection(collection_name) + docs = collection.find({"_id": {"$in": ids}}, include_fields) + # Return with _id field from Collection into id for Document + return with_id_rename(docs) def retrieve_docs( self, @@ -228,65 +375,85 @@ def retrieve_docs( n_results: int | The number of relevant documents to return. Default is 10. distance_threshold: float | The threshold for the distance score, only distance smaller than it will be returned. Don't filter with it if < 0. Default is -1. - kwargs: Dict | Additional keyword arguments. 
+ kwargs: Dict | Additional keyword arguments. Ones of importance follow: + oversampling_factor: int | This times n_results is 'ef' in the HNSW algorithm. + It determines the number of nearest neighbor candidates to consider during the search phase. + A higher value leads to more accuracy, but is slower. Default = 10 Returns: - QueryResults | The query results. Each query result is a list of list of tuples containing the document and - the distance. + QueryResults | For each query string, a list of nearest documents and their scores. """ + + collection = self.get_collection(collection_name) + # Trivial case of an empty collection + if collection.count_documents({}) == 0: + return [] + + # Ensure that there is at least one search index + search_indexes = list(collection.list_search_indexes()) + assert len(search_indexes), f"There are no search indexes for {collection.name}" + results = [] for query_text in queries: + # Compute embedding vector from semantic query query_vector = np.array(self.embedding_function([query_text])).tolist()[0] # Find documents with similar vectors using the specified index - search_collection = self.get_collection(collection_name) - pipeline = [ - { - "$vectorSearch": { - "index": index_name, - "limit": n_results, - "numCandidates": n_results, - "queryVector": query_vector, - "path": "embedding", - } - }, - {"$project": {"score": {"$meta": "vectorSearchScore"}}}, - ] - if distance_threshold >= 0.00: - similarity_threshold = 1 - distance_threshold - pipeline.append({"$match": {"score": {"gte": similarity_threshold}}}) - - # do a lookup on the same collection - pipeline.append( - { - "$lookup": { - "from": collection_name, - "localField": "_id", - "foreignField": "_id", - "as": "full_document_array", - } - } + query_result = _vector_search( + query_vector, + n_results, + collection, + index_name, + distance_threshold, + kwargs.get("oversampling_factor", 10), ) - pipeline.append( - { - "$addFields": { - "full_document": { - "$arrayElemAt": [ 
- { - "$map": { - "input": "$full_document_array", - "as": "doc", - "in": {"id": "$$doc.id", "content": "$$doc.content"}, - } - }, - 0, - ] - } - } - } + # Change each _id key to id. with_id_rename, but with (doc, score) tuples + results.append( + [({**{k: v for k, v in d[0].items() if k != "_id"}, "id": d[0]["_id"]}, d[1]) for d in query_result] ) - pipeline.append({"$project": {"full_document_array": 0, "embedding": 0}}) - tmp_results = [] - for doc in search_collection.aggregate(pipeline): - tmp_results.append((doc["full_document"], 1 - doc["score"])) - results.append(tmp_results) return results + + +def _vector_search( + embedding_vector: List[float], + n_results: int, + collection: Collection, + index_name: str, + distance_threshold: float = -1.0, + oversampling_factor=10, +) -> List[Tuple[Dict, float]]: + """Core $vectorSearch Aggregation pipeline. + + Args: + embedding_vector: Embedding vector of semantic query + n_results: Number of documents to return. Defaults to 4. + collection: MongoDB Collection with vector index + index_name: Name of the vector index + distance_threshold: Only distance measures smaller than this will be returned. + Don't filter with it if < 0. Default is -1. + oversampling_factor: int | This times n_results is 'ef' in the HNSW algorithm. + It determines the number of nearest neighbor candidates to consider during the search phase. + A higher value leads to more accuracy, but is slower. Default = 10 + + Returns: + List of tuples of length n_results from Collection. + Each tuple contains a document dict and a score. 
+ """ + + pipeline = [ + { + "$vectorSearch": { + "index": index_name, + "limit": n_results, + "numCandidates": n_results * oversampling_factor, + "queryVector": embedding_vector, + "path": "embedding", + } + }, + {"$set": {"score": {"$meta": "vectorSearchScore"}}}, + ] + if distance_threshold >= 0.0: + similarity_threshold = 1.0 - distance_threshold + pipeline.append({"$match": {"score": {"gte": similarity_threshold}}}) + + agg = collection.aggregate(pipeline) + return [(doc, doc.pop("score")) for doc in agg] diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 04e847414b4c..38914bc8436f 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -1,40 +1,85 @@ +import logging import os -import sys -import time -import urllib.parse +from time import sleep +from typing import List import pytest -from conftest import reason -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from autogen.agentchat.contrib.vectordb.base import Document try: + import pymongo import sentence_transformers from autogen.agentchat.contrib.vectordb.mongodb import MongoDBAtlasVectorDB except ImportError: - skip = True -else: - skip = False - -reason = "do not run on MacOS or windows OR dependency is not installed OR " + reason - - -@pytest.mark.skipif( - sys.platform in ["darwin", "win32"] or skip, - reason=reason, -) -def test_mongodb(): - # test db config - db_config = { - "connection_string": "mongodb://mongodb_user:mongodb_password@localhost:27017/database_name", - } - - # test create collection with connection_string authentication - db = MongoDBAtlasVectorDB( - connection_string=db_config["connection_string"], - ) - collection_name = "test_collection" + # To display warning in pyproject.toml [tool.pytest.ini_options] set log_cli = true + logger = logging.getLogger(__name__) + logger.warning(f"skipping {__name__}. 
It requires one to pip install pymongo or the extra [retrievechat-mongodb]") + pytest.skip(allow_module_level=True) + +from pymongo.collection import Collection +from pymongo.errors import OperationFailure + +logger = logging.getLogger(__name__) + +MONGODB_URI = os.environ.get("MONGODB_URI") +MONGODB_DATABASE = os.environ.get("DATABASE", "autogen_test_db") +MONGODB_COLLECTION = os.environ.get("MONGODB_COLLECTION", "autogen_test_vectorstore") +MONGODB_INDEX = os.environ.get("MONGODB_INDEX", "vector_index") + +RETRIES = 10 +DELAY = 2 + +if "MONGODB_URI" not in os.environ: + pytest.skip(allow_module_level=True) + + +@pytest.fixture +def db(): + """VectorDB setup and teardown, including collections and search indexes""" + vectorstore = MongoDBAtlasVectorDB(connection_string=MONGODB_URI, database_name=MONGODB_DATABASE) + vectorstore.delete_collection(MONGODB_COLLECTION) + yield vectorstore + for c in vectorstore.db.list_collection_names(): + clxn = vectorstore.get_collection(c) + clxn.drop() + sleep(20) # Provide time for resync of db and search services. + + +@pytest.fixture +def example_documents() -> List[Document]: + """Note mix of integers and strings as ids""" + return [ + Document(id=1, content="Dogs are tough.", metadata={"a": 1}), + Document(id=2, content="Cats have fluff.", metadata={"b": 1}), + Document(id="1", content="What is a sandwich?", metadata={"c": 1}), + Document(id="2", content="A sandwich makes a great lunch.", metadata={"d": 1, "e": 2}), + ] + + +@pytest.fixture +def db_with_indexed_clxn(db): + """Convenient when we wish to de-emphasize setup. + + We provide wait and retry method when running these quick integration tests. 
+ """ + collection = db.create_collection(MONGODB_COLLECTION) + if MONGODB_INDEX not in collection.list_search_indexes(): + retries = 3 + delay = 3 + success = False + while retries and not success: + try: + db.create_vector_search_index(collection, MONGODB_INDEX) + success = True + except OperationFailure: + retries -= 1 + sleep(delay) + return db, collection + + +def test_create_collection(db): """ def create_collection(collection_name: str, overwrite: bool = False, @@ -44,23 +89,19 @@ def create_collection(collection_name: str, - Case 2. the collection exists, if overwrite is True, it will overwrite the collection. - Case 3. the collection exists and overwrite is False, if get_or_create is True, it will get the collection, otherwise it raise a ValueError. """ + collection_name = "test_collection" + # test_create_collection: case 1 + collection = db.create_collection( + collection_name=collection_name, + ) if collection_name not in db.list_collections(): - collection = db.create_collection( - collection_name=collection_name, - index_name="my_index", - similarity="cosine", - overwrite=False, - get_or_create=True, - ) assert collection.name == collection_name # test_create_collection: case 2 # test overwrite=True collection = db.create_collection( collection_name=collection_name, - index_name="my_index_1", - similarity="cosine", overwrite=True, get_or_create=True, ) @@ -70,67 +111,186 @@ def create_collection(collection_name: str, # test overwrite=False # test get_or_create=False with pytest.raises(ValueError): - collection = db.create_collection( - collection_name, index_name="my_index_1", similarity="cosine", overwrite=False, get_or_create=False - ) + collection = db.create_collection(collection_name, overwrite=False, get_or_create=False) # test get_or_create=True - collection = db.create_collection( - collection_name, index_name="my_index_1", similarity="cosine", overwrite=False, get_or_create=True - ) + collection = db.create_collection(collection_name, 
overwrite=False, get_or_create=True) assert collection.name == collection_name - # test_get_collection - collection = db.get_collection(collection_name) - assert collection.name == collection_name - # test_insert_docs - docs = [{"content": "doc1", "id": "1"}, {"content": "doc2", "id": "2"}, {"content": "doc3", "id": "3"}] - db.insert_docs(docs, collection_name, upsert=False) - res = list(db.get_collection(collection_name).find({"id": {"$in": ["1", "2"]}})) - final_results = [result.get("content") for result in res] - assert final_results == ["doc1", "doc2"] - - # test_update_docs - docs = [{"content": "doc11", "id": "1"}, {"content": "doc2", "id": "2"}, {"content": "doc3", "id": "3"}] - db.update_docs(docs, collection_name) - res = list(db.get_collection(collection_name).find({"id": {"$in": ["1", "2"]}})) - final_results = [result.get("content") for result in res] - assert final_results == ["doc11", "doc2"] - - # test_delete_docs - ids = ["1"] - db.delete_docs(ids, collection_name) - res = list(db.get_collection(collection_name).find({"id": {"$in": ids}})) - final_results = [result.get("content") for result in res] - assert final_results == [] - - # sleep for a few seconds -- make sure vector search index is ready - time.sleep(30) - # test_retrieve_docs - """ - [[({'content': 'doc2', 'id': '2'}, 0.0), - ({'content': 'doc3', 'id': '3'}, 0.08)], - [({'content': 'doc3', 'id': '3'}, 0.0), - ({'content': 'doc2', 'id': '2'}, 0.08)]] - """ - queries = ["doc2", "doc3"] - res = db.retrieve_docs(queries=queries, collection_name=collection_name, index_name="my_index_1") - assert [[r[0]["id"] for r in rr] for rr in res] == [["2", "3"], ["3", "2"]] - res = db.retrieve_docs( - queries=queries, collection_name=collection_name, distance_threshold=0.05, index_name="my_index_1" +def test_get_collection(db): + collection_name = MONGODB_COLLECTION + + with pytest.raises(ValueError): + db.get_collection() + + collection_created = db.create_collection(collection_name) + assert 
isinstance(collection_created, Collection) + assert collection_created.name == collection_name + + collection_got = db.get_collection(collection_name) + assert collection_got.name == collection_created.name + assert collection_got.name == db.active_collection.name + + +def test_delete_collection(db): + assert MONGODB_COLLECTION not in db.list_collections() + collection = db.create_collection(MONGODB_COLLECTION) + assert MONGODB_COLLECTION in db.list_collections() + db.delete_collection(collection.name) + assert MONGODB_COLLECTION not in db.list_collections() + + +def test_insert_docs(db, example_documents): + # Test exception if one attempts to upsert with insert + with pytest.raises(ValueError) as exc: + db.insert_docs(example_documents, upsert=True) + assert "use update_docs with upsert=True" in str(exc.value) + + # Test that there's an active collection + with pytest.raises(ValueError) as exc: + db.insert_docs(example_documents) + assert "No collection is specified" in str(exc.value) + + # Create a collection + collection = db.create_collection(MONGODB_COLLECTION) + # Create a search index + if MONGODB_INDEX not in collection.list_search_indexes(): + db.create_vector_search_index(collection, MONGODB_INDEX) + + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + found = list(collection.find({})) + assert len(found) == len(example_documents) + # Check that documents have correct fields, including "_id" and "embedding" but not "id" + assert all([set(doc.keys()) == {"_id", "content", "metadata", "embedding"} for doc in found]) + # Check ids + assert {doc["_id"] for doc in found} == {1, "1", 2, "2"} + # Check embedding lengths + assert len(found[0]["embedding"]) == 384 + + +def test_update_docs(db_with_indexed_clxn, example_documents): + db, collection = db_with_indexed_clxn + # Use update_docs to insert new documents + db.update_docs(example_documents, MONGODB_COLLECTION, upsert=True) + # Test that no changes were 
made to example_documents + assert set(example_documents[0].keys()) == {"id", "content", "metadata"} + assert collection.count_documents({}) == len(example_documents) + found = list(collection.find({})) + # Check that documents have correct fields, including "_id" and "embedding" but not "id" + assert all([set(doc.keys()) == {"_id", "content", "metadata", "embedding"} for doc in found]) + # Check ids + assert {doc["_id"] for doc in found} == {1, "1", 2, "2"} + + # Update an *existing* Document + updated_doc = Document(id=1, content="Cats are tough.", metadata={"a": 10}) + db.update_docs([updated_doc], MONGODB_COLLECTION) + assert collection.find_one({"_id": 1})["content"] == "Cats are tough." + + # Upsert a *new* Document + new_id = 3 + new_doc = Document(id=new_id, content="Cats are tough.") + db.update_docs([new_doc], MONGODB_COLLECTION, upsert=True) + assert collection.find_one({"_id": new_id})["content"] == "Cats are tough." + + # Attempting to use update to insert a new doc + # *without* setting upsert set to True + # is a no-op in MongoDB. # TODO Confirm behaviour and autogen's preference. 
+ new_id = 4 + new_doc = Document(id=new_id, content="That is NOT a sandwich?") + db.update_docs([new_doc], MONGODB_COLLECTION) + assert collection.find_one({"_id": new_id}) is None + + +def test_delete_docs(db_with_indexed_clxn, example_documents): + db, collection = db_with_indexed_clxn + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + # Delete the 1s + db.delete_docs(ids=[1, "1"], collection_name=MONGODB_COLLECTION) + # Confirm just the 2s remain + clxn = db.get_collection(MONGODB_COLLECTION) + assert {2, "2"} == {doc["_id"] for doc in clxn.find({})} + + +def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): + db, collection = db_with_indexed_clxn + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + + # Test without setting "include" kwarg + docs = db.get_docs_by_ids(ids=[2, "2"], collection_name=MONGODB_COLLECTION) + assert len(docs) == 2 + assert all([doc["id"] in [2, "2"] for doc in docs]) + assert set(docs[0].keys()) == {"id", "content", "metadata"} + + # Test with include + docs = db.get_docs_by_ids(ids=[2], include=["content"], collection_name=MONGODB_COLLECTION) + assert len(docs) == 1 + assert set(docs[0].keys()) == {"id", "content"} + + # Test with empty ids list + docs = db.get_docs_by_ids(ids=[], include=["content"], collection_name=MONGODB_COLLECTION) + assert len(docs) == 0 + + +def test_retrieve_docs(db, example_documents): + # Create collection + collection = db.get_collection(MONGODB_COLLECTION) + # Sanity test. 
Retrieving docs before documents have been added + results = db.retrieve_docs( + queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2, index_name=MONGODB_INDEX ) - assert [[r[0]["id"] for r in rr] for rr in res] == [["2"], ["3"]] - # test_get_docs_by_ids - res = db.get_docs_by_ids(["1", "2"], collection_name) - assert [r["id"] for r in res] == ["2"] # "1" has been deleted - res = db.get_docs_by_ids(collection_name=collection_name) - assert set([r["id"] for r in res]) == set(["2", "3"]) # All Docs returned + assert results == [] + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + + # Sanity test. Retrieving docs before the search index had been created + with pytest.raises(AssertionError) as exc: + db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2, index_name=MONGODB_INDEX) + assert "There are no search indexes" in str(exc.value) + # Create the index + db.create_vector_search_index(collection=collection, index_name=MONGODB_INDEX) + + # Begin testing Atlas Vector Search + # NOTE: Indexing may take some time, so we must be patient on the first query. 
+ # Immediately adding documents and then querying is only standard for testing - # test_delete_collection - db.delete_collection(collection_name) - # check if the collection is deleted - pytest.raises(ValueError, db.get_collection, collection_name) + n_results = 2 # Number of closest docs to return + + success = False + retries = RETRIES + while retries and not success: + results = db.retrieve_docs( + queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results, index_name=MONGODB_INDEX + ) + if len(results[0]) == n_results: + success = True + else: + retries -= 1 + sleep(DELAY) + if not success: + raise OperationFailure(f"Failed to retrieve docs after {RETRIES} retries, waiting {DELAY} seconds after each.") + + assert {doc[0]["id"] for doc in results[0]} == {1, 2} + + # Empty list of queries returns empty list of results + results = db.retrieve_docs( + queries=[], collection_name=MONGODB_COLLECTION, n_results=n_results, index_name=MONGODB_INDEX + ) + assert results == [] + + # Empty list of queries returns empty list of results + queries = ["Some good pets", "What kind of Sandwich?"] + results = db.retrieve_docs( + queries=queries, collection_name=MONGODB_COLLECTION, n_results=2, index_name=MONGODB_INDEX + ) + assert len(results) == len(queries) + assert all([len(res) == n_results for res in results]) + assert {doc[0]["id"] for doc in results[0]} == {1, 2} + assert {doc[0]["id"] for doc in results[1]} == {"1", "2"} -if __name__ == "__main__": - test_mongodb() +def test_search_indexes(db): + pass + # TODO From a367426a2f5a2deaa081e97faea0703b36eecef8 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Mon, 24 Jun 2024 11:44:36 -0400 Subject: [PATCH 10/33] Update mongodb.py --- autogen/agentchat/contrib/vectordb/mongodb.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 856808f5dc94..3067b81c407d 100644 --- 
a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -32,6 +32,7 @@ def __init__( database_name: str = "vector_db", embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, collection_name: str = None, + index_name: str = "default_index", ): """ Initialize the vector database. @@ -62,7 +63,6 @@ def __init__( # index lookup self.database_name = database_name self.index_name = index_name - self.similarity = similarity def list_collections(self): """ @@ -234,7 +234,8 @@ def insert_docs( id_batch.append(id) text_batch.append(text) metadata_batch.append(metadata) - size += len(text) + len(metadata) # todo consider len(id) when str|int + id_size = 1 if isinstance(id, int) else len(id) + size += len(text) + len(metadata) + id_size if (i + 1) % batch_size == 0 or size >= 47_000_000: result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, id_batch)) input_ids.update(id_batch) @@ -395,7 +396,7 @@ def retrieve_docs( # Ensure that there is at least one search index search_indexes = list(collection.list_search_indexes()) assert len(search_indexes), f"There are no search indexes for {collection.name}" - + results = [] for query_text in queries: # Compute embedding vector from semantic query @@ -405,7 +406,7 @@ def retrieve_docs( query_vector, n_results, collection, - index_name, + self.index_name, distance_threshold, kwargs.get("oversampling_factor", 10), ) From ffa3e382a0969730c84c63e7601da380029a3d35 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Mon, 24 Jun 2024 12:02:11 -0400 Subject: [PATCH 11/33] Update mongodb.py - Casey --- autogen/agentchat/contrib/vectordb/mongodb.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 3067b81c407d..64702fbb2ae7 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py 
@@ -32,7 +32,6 @@ def __init__( database_name: str = "vector_db", embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, collection_name: str = None, - index_name: str = "default_index", ): """ Initialize the vector database. @@ -60,9 +59,6 @@ def __init__( sentences = ["The weather is lovely today in paradise."] embeddings = self.embedding_function(sentences) self.dimensions = len(embeddings[0]) - # index lookup - self.database_name = database_name - self.index_name = index_name def list_collections(self): """ @@ -368,6 +364,7 @@ def retrieve_docs( collection_name: str = None, n_results: int = 10, distance_threshold: float = -1, + index_name: str = "default", **kwargs, ) -> QueryResults: """ @@ -396,7 +393,7 @@ def retrieve_docs( # Ensure that there is at least one search index search_indexes = list(collection.list_search_indexes()) assert len(search_indexes), f"There are no search indexes for {collection.name}" - + results = [] for query_text in queries: # Compute embedding vector from semantic query @@ -406,7 +403,7 @@ def retrieve_docs( query_vector, n_results, collection, - self.index_name, + index_name, distance_threshold, kwargs.get("oversampling_factor", 10), ) From 3646d1e35681f6905a200048379038ef8ba20778 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Sun, 30 Jun 2024 01:05:10 -0400 Subject: [PATCH 12/33] search_index_magic index_name change; keeping track of lucene indexes is tricky --- autogen/agentchat/contrib/vectordb/mongodb.py | 163 +++++++++++------- .../contrib/vectordb/test_mongodb.py | 10 +- 2 files changed, 109 insertions(+), 64 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 64702fbb2ae7..dc1185f9bba4 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -1,4 +1,5 @@ from copy import deepcopy +from time import sleep from typing import Any, Callable, Dict, Iterable, List, Literal, 
Mapping, Set, Tuple, Union import numpy as np @@ -6,6 +7,7 @@ from pymongo.collection import Collection from pymongo.cursor import Cursor from pymongo.operations import SearchIndexModel +from pymongo.errors import OperationFailure from sentence_transformers import SentenceTransformer from .base import Document, ItemID, QueryResults, VectorDB @@ -32,6 +34,7 @@ def __init__( database_name: str = "vector_db", embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, collection_name: str = None, + index_name: str = "vector_index", ): """ Initialize the vector database. @@ -59,6 +62,8 @@ def __init__( sentences = ["The weather is lovely today in paradise."] embeddings = self.embedding_function(sentences) self.dimensions = len(embeddings[0]) + # MongoDB Atlas Search Index + self.index_name = index_name def list_collections(self): """ @@ -80,8 +85,6 @@ def create_collection( Args: collection_name: str | The name of the collection. - index_name: str | The name of the index. - similarity: str | The similarity metric for the vector search index. overwrite: bool | Whether to overwrite the collection if it exists. Default is False. get_or_create: bool | Whether to get the collection if it exists. Default is True """ @@ -103,6 +106,36 @@ def create_collection( else: # get_or_create is False and the collection already exists, raise an error. raise ValueError(f"Collection {collection_name} already exists.") + def create_index_if_not_exists(self, index_name: str = "vector_index", collection: Collection = None): + """ + Creates a vector search index on the specified collection in MongoDB. + + Args: + MONGODB_INDEX (str, optional): The name of the vector search index to create. Defaults to "vector_search_index". + collection (Collection, optional): The MongoDB collection to create the index on. Defaults to None. + + Returns: + bool: True if the index was successfully created, False otherwise. 
+ """ + success = False + # Check if the index already exists + if index_name not in collection.list_search_indexes(): + # Define retry logic with exponential backoff + retries = 3 + delay = 3 + while retries and not success: + try: + # Attempt to create the vector search index + self.create_vector_search_index(collection, index_name) + success = True + except OperationFailure: + # Handle potential operation failure + retries -= 1 + sleep(delay) + delay *= 2 # Increase delay for next retry + else: # index exists + success = True + return success def get_collection(self, collection_name: str = None) -> Collection: """ @@ -139,7 +172,7 @@ def delete_collection(self, collection_name: str) -> None: def create_vector_search_index( self, collection: Collection, - index_name: Union[str, None] = "default_index", + index_name: Union[str, None] = "vector_index", similarity: Literal["euclidean", "cosine", "dotProduct"] = "cosine", ) -> None: """Create a vector search index in the collection. @@ -170,6 +203,16 @@ def create_vector_search_index( # Create the search index try: collection.create_search_index(model=search_index_model) + # Wait for the index to be created + keep_trying = True + while keep_trying: + indexes = collection.list_search_indexes() + for index in indexes: + if index["name"] == index_name and index["status"] == "READY": + keep_trying = False + else: + sleep(2) # 2s delay between checks + logger.info(f"Search index {index_name} created successfully.") except Exception as e: logger.error( f"Error creating search index: {e}. \n" @@ -178,7 +221,11 @@ def create_vector_search_index( f"if you are on a free/shared cluster." 
) raise e - + def upsert_docs(self, docs, collection): + for doc in docs: + query = {"id": doc["id"]} + new_values = {"$set": doc} + collection.update_one(query, new_values, upsert=True) def insert_docs( self, docs: List[Document], @@ -201,57 +248,52 @@ def insert_docs( logger.info("No documents to insert.") return - if upsert: - raise ValueError( - "If attempting to upsert, please use update_docs with upsert=True. " - "upsert is a function of update in MongoDB. " - "It is most efficient if done there." - ) - collection = self.get_collection(collection_name) - - # Sanity checking the first document - if docs[0].get("content") is None: - raise ValueError("The document content is required.") - if docs[0].get("id") is None: - raise ValueError("The document id is required.") - - input_ids = set() - result_ids = set() - id_batch = [] - text_batch = [] - metadata_batch = [] - size = 0 - i = 0 - for doc in docs: - id = doc["id"] - text = doc["content"] - metadata = doc.get("metadata", {}) - id_batch.append(id) - text_batch.append(text) - metadata_batch.append(metadata) - id_size = 1 if isinstance(id, int) else len(id) - size += len(text) + len(metadata) + id_size - if (i + 1) % batch_size == 0 or size >= 47_000_000: - result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, id_batch)) + if upsert: + self.upsert_docs(docs, collection) + else: + # Sanity checking the first document + if docs[0].get("content") is None: + raise ValueError("The document content is required.") + if docs[0].get("id") is None: + raise ValueError("The document id is required.") + + input_ids = set() + result_ids = set() + id_batch = [] + text_batch = [] + metadata_batch = [] + size = 0 + i = 0 + for doc in docs: + id = doc["id"] + text = doc["content"] + metadata = doc.get("metadata", {}) + id_batch.append(id) + text_batch.append(text) + metadata_batch.append(metadata) + id_size = 1 if isinstance(id, int) else len(id) + size += len(text) + len(metadata) + id_size + if (i + 1) % 
batch_size == 0 or size >= 47_000_000: + result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, id_batch)) + input_ids.update(id_batch) + id_batch = [] + text_batch = [] + metadata_batch = [] + size = 0 + i += 1 + if text_batch: + result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, id_batch)) # type: ignore input_ids.update(id_batch) - id_batch = [] - text_batch = [] - metadata_batch = [] - size = 0 - i += 1 - if text_batch: - result_ids.update(self._insert_batch(collection, text_batch, metadata_batch, id_batch)) # type: ignore - input_ids.update(id_batch) - - if result_ids != input_ids: - logger.warning( - "Possible data corruption. " - "input_ids not in result_ids: {in_diff}.\n" - "result_ids not in input_ids: {out_diff}".format( - in_diff=input_ids.difference(result_ids), out_diff=result_ids.difference(input_ids) + + if result_ids != input_ids: + logger.warning( + "Possible data corruption. " + "input_ids not in result_ids: {in_diff}.\n" + "result_ids not in input_ids: {out_diff}".format( + in_diff=input_ids.difference(result_ids), out_diff=result_ids.difference(input_ids) + ) ) - ) def _insert_batch( self, collection: Collection, texts: List[str], metadatas: List[Mapping[str, Any]], ids: List[ItemID] @@ -352,11 +394,15 @@ def get_docs_by_ids( include_fields = {"_id": 1, "content": 1, "metadata": 1} else: include_fields = {k: 1 for k in set(include).union({"_id"})} - collection = self.get_collection(collection_name) - docs = collection.find({"_id": {"$in": ids}}, include_fields) - # Return with _id field from Collection into id for Document - return with_id_rename(docs) + if ids is not None: + docs = collection.find({"_id": {"$in": ids}}, include_fields) + # Return with _id field from Collection into id for Document + return with_id_rename(docs) + else: + docs = collection.find({}, include_fields) + # Return with _id field from Collection into id for Document + return with_id_rename(docs) def retrieve_docs( self, @@ 
-364,7 +410,6 @@ def retrieve_docs( collection_name: str = None, n_results: int = 10, distance_threshold: float = -1, - index_name: str = "default", **kwargs, ) -> QueryResults: """ @@ -384,8 +429,8 @@ def retrieve_docs( Returns: QueryResults | For each query string, a list of nearest documents and their scores. """ - collection = self.get_collection(collection_name) + self.create_index_if_not_exists(index_name=self.index_name, collection=collection) # Trivial case of an empty collection if collection.count_documents({}) == 0: return [] @@ -403,7 +448,7 @@ def retrieve_docs( query_vector, n_results, collection, - index_name, + self.index_name, distance_threshold, kwargs.get("oversampling_factor", 10), ) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index bd9ed42e66e4..1825d851d829 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -238,7 +238,7 @@ def test_retrieve_docs(db, example_documents): collection = db.get_collection(MONGODB_COLLECTION) # Sanity test. Retrieving docs before documents have been added results = db.retrieve_docs( - queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2, index_name=MONGODB_INDEX + queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2 ) assert results == [] # Insert example documents @@ -246,7 +246,7 @@ def test_retrieve_docs(db, example_documents): # Sanity test. 
Retrieving docs before the search index had been created with pytest.raises(AssertionError) as exc: - db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2, index_name=MONGODB_INDEX) + db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) assert "There are no search indexes" in str(exc.value) # Create the index db.create_vector_search_index(collection=collection, index_name=MONGODB_INDEX) @@ -261,7 +261,7 @@ def test_retrieve_docs(db, example_documents): retries = RETRIES while retries and not success: results = db.retrieve_docs( - queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results, index_name=MONGODB_INDEX + queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results ) if len(results[0]) == n_results: success = True @@ -275,14 +275,14 @@ def test_retrieve_docs(db, example_documents): # Empty list of queries returns empty list of results results = db.retrieve_docs( - queries=[], collection_name=MONGODB_COLLECTION, n_results=n_results, index_name=MONGODB_INDEX + queries=[], collection_name=MONGODB_COLLECTION, n_results=n_results ) assert results == [] # Empty list of queries returns empty list of results queries = ["Some good pets", "What kind of Sandwich?"] results = db.retrieve_docs( - queries=queries, collection_name=MONGODB_COLLECTION, n_results=2, index_name=MONGODB_INDEX + queries=queries, collection_name=MONGODB_COLLECTION, n_results=2 ) assert len(results) == len(queries) assert all([len(res) == n_results for res in results]) From 3e0ac8e78b179a1513c086a749aec77e8799cf79 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 30 Jun 2024 21:05:18 +0800 Subject: [PATCH 13/33] Fix format --- autogen/agentchat/contrib/vectordb/mongodb.py | 11 +++++++---- test/agentchat/contrib/vectordb/test_mongodb.py | 17 +++++------------ 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py 
b/autogen/agentchat/contrib/vectordb/mongodb.py index dc1185f9bba4..05a930c1c921 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -6,8 +6,8 @@ from pymongo import MongoClient, UpdateOne, errors from pymongo.collection import Collection from pymongo.cursor import Cursor -from pymongo.operations import SearchIndexModel from pymongo.errors import OperationFailure +from pymongo.operations import SearchIndexModel from sentence_transformers import SentenceTransformer from .base import Document, ItemID, QueryResults, VectorDB @@ -106,6 +106,7 @@ def create_collection( else: # get_or_create is False and the collection already exists, raise an error. raise ValueError(f"Collection {collection_name} already exists.") + def create_index_if_not_exists(self, index_name: str = "vector_index", collection: Collection = None): """ Creates a vector search index on the specified collection in MongoDB. @@ -133,7 +134,7 @@ def create_index_if_not_exists(self, index_name: str = "vector_index", collectio retries -= 1 sleep(delay) delay *= 2 # Increase delay for next retry - else: # index exists + else: # index exists success = True return success @@ -211,7 +212,7 @@ def create_vector_search_index( if index["name"] == index_name and index["status"] == "READY": keep_trying = False else: - sleep(2) # 2s delay between checks + sleep(2) # 2s delay between checks logger.info(f"Search index {index_name} created successfully.") except Exception as e: logger.error( @@ -221,11 +222,13 @@ def create_vector_search_index( f"if you are on a free/shared cluster." 
) raise e + def upsert_docs(self, docs, collection): for doc in docs: query = {"id": doc["id"]} new_values = {"$set": doc} collection.update_one(query, new_values, upsert=True) + def insert_docs( self, docs: List[Document], @@ -251,7 +254,7 @@ def insert_docs( collection = self.get_collection(collection_name) if upsert: self.upsert_docs(docs, collection) - else: + else: # Sanity checking the first document if docs[0].get("content") is None: raise ValueError("The document content is required.") diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 1825d851d829..27c414972b68 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -116,6 +116,7 @@ def create_collection(collection_name: str, collection = db.create_collection(collection_name, overwrite=False, get_or_create=True) assert collection.name == collection_name + def test_get_collection(db): collection_name = MONGODB_COLLECTION @@ -237,9 +238,7 @@ def test_retrieve_docs(db, example_documents): # Create collection collection = db.get_collection(MONGODB_COLLECTION) # Sanity test. 
Retrieving docs before documents have been added - results = db.retrieve_docs( - queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2 - ) + results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) assert results == [] # Insert example documents db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) @@ -260,9 +259,7 @@ def test_retrieve_docs(db, example_documents): success = False retries = RETRIES while retries and not success: - results = db.retrieve_docs( - queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results - ) + results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) if len(results[0]) == n_results: success = True else: @@ -274,16 +271,12 @@ def test_retrieve_docs(db, example_documents): assert {doc[0]["id"] for doc in results[0]} == {1, 2} # Empty list of queries returns empty list of results - results = db.retrieve_docs( - queries=[], collection_name=MONGODB_COLLECTION, n_results=n_results - ) + results = db.retrieve_docs(queries=[], collection_name=MONGODB_COLLECTION, n_results=n_results) assert results == [] # Empty list of queries returns empty list of results queries = ["Some good pets", "What kind of Sandwich?"] - results = db.retrieve_docs( - queries=queries, collection_name=MONGODB_COLLECTION, n_results=2 - ) + results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=2) assert len(results) == len(queries) assert all([len(res) == n_results for res in results]) assert {doc[0]["id"] for doc in results[0]} == {1, 2} From 95e2f792c454929cad3c78f2b3f606c43a178ca6 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 30 Jun 2024 22:57:47 +0800 Subject: [PATCH 14/33] Fix tests --- .github/workflows/contrib-tests.yml | 3 --- autogen/agentchat/contrib/vectordb/base.py | 3 ++- .../contrib/vectordb/test_mongodb.py | 19 +++++++------------ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git 
a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 7c936c467b2a..ee0315ad9b65 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -89,9 +89,6 @@ jobs: image: mongodb/mongodb-atlas-local:latest ports: - 27017:27017 - env: - MONGODB_INITDB_ROOT_USERNAME: mongodb_user - MONGODB_INITDB_ROOT_PASSWORD: mongodb_password steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/autogen/agentchat/contrib/vectordb/base.py b/autogen/agentchat/contrib/vectordb/base.py index 41b8f143ab83..788267308e3f 100644 --- a/autogen/agentchat/contrib/vectordb/base.py +++ b/autogen/agentchat/contrib/vectordb/base.py @@ -171,7 +171,8 @@ def get_docs_by_ids( ids: List[ItemID] | A list of document ids. If None, will return all the documents. Default is None. collection_name: str | The name of the collection. Default is None. include: List[str] | The fields to include. Default is None. - If None, will include ["metadata", "content"], ids will always be included. # TODO - Confirm keys + If None, will include ["metadatas", "documents"], ids will always be included. This may differ + depending on the implementation. kwargs: dict | Additional keyword arguments. 
Returns: diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 27c414972b68..0c703cb9175e 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -MONGODB_URI = os.environ.get("MONGODB_URI") +MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/?directConnection=true") MONGODB_DATABASE = os.environ.get("DATABASE", "autogen_test_db") MONGODB_COLLECTION = os.environ.get("MONGODB_COLLECTION", "autogen_test_vectorstore") MONGODB_INDEX = os.environ.get("MONGODB_INDEX", "vector_index") @@ -31,9 +31,6 @@ RETRIES = 10 DELAY = 2 -if "MONGODB_URI" not in os.environ: - pytest.skip(allow_module_level=True) - @pytest.fixture def db(): @@ -141,17 +138,16 @@ def test_delete_collection(db): def test_insert_docs(db, example_documents): - # Test exception if one attempts to upsert with insert - with pytest.raises(ValueError) as exc: - db.insert_docs(example_documents, upsert=True) - assert "use update_docs with upsert=True" in str(exc.value) - # Test that there's an active collection with pytest.raises(ValueError) as exc: db.insert_docs(example_documents) assert "No collection is specified" in str(exc.value) + # Test upsert + db.insert_docs(example_documents, MONGODB_COLLECTION, upsert=True) + # Create a collection + db.delete_collection(MONGODB_COLLECTION) collection = db.create_collection(MONGODB_COLLECTION) # Create a search index if MONGODB_INDEX not in collection.list_search_indexes(): @@ -236,6 +232,7 @@ def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): def test_retrieve_docs(db, example_documents): # Create collection + db.delete_collection(MONGODB_COLLECTION) collection = db.get_collection(MONGODB_COLLECTION) # Sanity test. 
Retrieving docs before documents have been added results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) @@ -244,9 +241,7 @@ def test_retrieve_docs(db, example_documents): db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) # Sanity test. Retrieving docs before the search index had been created - with pytest.raises(AssertionError) as exc: - db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) - assert "There are no search indexes" in str(exc.value) + db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) # Create the index db.create_vector_search_index(collection=collection, index_name=MONGODB_INDEX) From 64a157c08e17cbe9c9cbeeb1bfce4dd0e00ef005 Mon Sep 17 00:00:00 2001 From: Fabian Valle Date: Mon, 1 Jul 2024 09:09:56 -0400 Subject: [PATCH 15/33] hacking trying to figure this out --- autogen/agentchat/contrib/vectordb/mongodb.py | 26 +- notebook/agentchat_mongodb_RetrieveChat.ipynb | 300 +++++++++++++----- 2 files changed, 241 insertions(+), 85 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index dc1185f9bba4..40df2add2a15 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -95,14 +95,19 @@ def create_collection( collection_names = self.db.list_collection_names() if collection_name not in collection_names: # Create a new collection - return self.db.create_collection(collection_name) - + coll = self.db.create_collection(collection_name) + self.create_index_if_not_exists(index_name=self.index_name, collection=coll) + return coll if overwrite: self.db.drop_collection(collection_name) - + coll = self.db.create_collection(collection_name) + self.create_index_if_not_exists(index_name=self.index_name, collection=coll) + return coll if get_or_create: # The collection already exists, return it. 
- return self.db[collection_name] + coll = self.db[collection_name] + self.create_index_if_not_exists(index_name=self.index_name, collection=coll) + return coll else: # get_or_create is False and the collection already exists, raise an error. raise ValueError(f"Collection {collection_name} already exists.") @@ -224,6 +229,7 @@ def create_vector_search_index( def upsert_docs(self, docs, collection): for doc in docs: query = {"id": doc["id"]} + doc["embedding"] = np.array(self.embedding_function([doc["content"]])).tolist()[0] new_values = {"$set": doc} collection.update_one(query, new_values, upsert=True) def insert_docs( @@ -430,7 +436,6 @@ def retrieve_docs( QueryResults | For each query string, a list of nearest documents and their scores. """ collection = self.get_collection(collection_name) - self.create_index_if_not_exists(index_name=self.index_name, collection=collection) # Trivial case of an empty collection if collection.count_documents({}) == 0: return [] @@ -438,10 +443,17 @@ def retrieve_docs( # Ensure that there is at least one search index search_indexes = list(collection.list_search_indexes()) assert len(search_indexes), f"There are no search indexes for {collection.name}" - + # Check status of index! 
+ for index in search_indexes: + if index["name"] == self.index_name and index["type"] == "vectorSearch" and index["status"] != "READY": + raise Exception(f"Index {self.index_name} is not ready!") + logger.info(f"Using index: {str(list(search_indexes))}") results = [] + sleep(15) for query_text in queries: # Compute embedding vector from semantic query + print('query_text', query_text) + logger.info(f"Query: {query_text}") query_vector = np.array(self.embedding_function([query_text])).tolist()[0] # Find documents with similar vectors using the specified index query_result = _vector_search( @@ -497,6 +509,8 @@ def _vector_search( }, {"$set": {"score": {"$meta": "vectorSearchScore"}}}, ] + print("pipeline: ", pipeline) + logger.info("pipeline: %s", pipeline) if distance_threshold >= 0.0: similarity_threshold = 1.0 - distance_threshold pipeline.append({"$match": {"score": {"gte": similarity_threshold}}}) diff --git a/notebook/agentchat_mongodb_RetrieveChat.ipynb b/notebook/agentchat_mongodb_RetrieveChat.ipynb index aca2f1174921..f1e696fe6e98 100644 --- a/notebook/agentchat_mongodb_RetrieveChat.ipynb +++ b/notebook/agentchat_mongodb_RetrieveChat.ipynb @@ -70,7 +70,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "models to use: ['gpt-3.5-turbo-0125', 'gpt-35-turbo']\n" + "models to use: ['gpt-35-turbo']\n" ] } ], @@ -87,7 +87,6 @@ "from autogen.retrieve_utils import TEXT_FORMATS\n", "\n", "config_list = [\n", - " {\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"\", \"api_type\": \"openai\"},\n", " {\n", " \"model\": \"gpt-35-turbo\",\n", " \"base_url\": \"\",\n", @@ -137,24 +136,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[autogen.oai.client: 06-22 21:19:19] {150} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" - ] - }, - { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "2024-06-22 21:19:19,131 WARNING autogen.oai.client: The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" - ] - } - ], + "outputs": [], "source": [ "# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n", "assistant = RetrieveAssistantAgent(\n", @@ -192,10 +176,11 @@ " \"chunk_token_size\": 2000,\n", " \"model\": config_list[0][\"model\"],\n", " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", - " \"collection_name\": \"flaml_collection\",\n", + " \"collection_name\": \"demo_collection\",\n", " \"db_config\": {\n", - " \"connection_string\": \"\", # MongoDB Atlas connection string\n", - " \"database_name\": \"test_db\", # MongoDB Atlas database\n", + " \"connection_string\": \"\", # MongoDB Atlas connection string\n", + " \"database_name\": \"\", # MongoDB Atlas database\n", + " \"index_name\":\"vector_index\",\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", @@ -227,27 +212,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "Trying to create collection.\n", - "Creating index on flaml_collection. 
Let's wait for the index to be READY.\n", - "default_index on flaml_collection is READY.\n" + "Trying to create collection.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-22 21:19:45,413 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n" + "2024-07-01 08:50:43,934 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Search index vector_index created successfully.\u001b[0m\n", + "2024-07-01 08:50:44,612 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-07-01 08:50:45,064 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", + "2024-07-01 08:51:00,069 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", + "2024-07-01 08:51:00,164 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 20, 'numCandidates': 200, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 
0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 
0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, 
-0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 
0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "index is ready to use.\n", - "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", - "Now running pipeline: [{'$vectorSearch': 
{'index': 'default_index', 'limit': 20, 'numCandidates': 20, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 
0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 
0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, 
-0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 
0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", - "VectorDB returns doc_ids: [['bdfbc921']]\n", - "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", + "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\n", + "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 20, 'numCandidates': 200, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, 
-0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, 
-0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 
0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, 
-0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", + "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n", + "\u001b[32mAdding content of doc 6682a624b9758026836ffab5 to context.\u001b[0m\n", + "\u001b[32mAdding content of doc 6682a624b9758026836ffad5 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -391,6 +378,120 @@ "```\n", "\n", "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "# Research\n", + "\n", + "For technical details, please check our research publications.\n", + "\n", + "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. 
MLSys 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", + "```\n", + "\n", + "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", + "```\n", + "\n", + "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", + "```\n", + "\n", + "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. 
ICML 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", + "```\n", + "\n", + "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", + "```\n", + "\n", + "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", + "\n", + "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", + "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", + "\n", + "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. 
ArXiv preprint arXiv:2303.04673 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", + "```\n", "\n", "\n", "\n", @@ -413,45 +514,30 @@ "text": [ "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "To perform a classification task using FLAML and Spark for parallel training, you can use the `lgbm_spark` estimator and activate Spark as the parallel backend during parallel tuning by setting the `use_spark` to `true`. You can also train for 30 seconds and force cancel jobs if the time limit is reached by setting `time_budget` and `force_cancel` in the `automl_settings`. Here's an example code snippet to perform classification:\n", + "You can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using joblib-spark. 
According to the documentation, to use FLAML with Spark, you need to prepare your data in pandas-on-spark format using the `flaml.automl.spark.utils.to_pandas_on_spark` function in the `flaml.automl.spark.utils` module. Then, you can pass pandas-on-spark data to FLAML as normal data using `dataframe` and `label`. For example, to use SparkML models for regression and train for 30 seconds with force cancel, you can use the following code snippet:\n", "\n", "```python\n", "import flaml\n", - "import pandas as pd\n", "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "from pyspark.ml.feature import VectorAssembler\n", "\n", - "# Prepare your data in pandas-on-spark format\n", - "data = {\n", - " \"feature1\": [0,1,1,0,1],\n", - " \"feature2\": [1,0,1,1,0],\n", - " \"feature3\": [0,1,0,1,1],\n", - " \"class\": [0,0,1,0,1],\n", - "}\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"class\"\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "# load your data into a pandas dataframe\n", + "train_data = ...\n", "\n", - "# Use FLAML to perform classification with Spark for parallel training\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30, # Set time budget to 30 seconds\n", - " \"metric\": \"accuracy\", # Use accuracy metric for classification\n", - " \"task\": \"classification\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # Use lgbm_spark estimator for classification\n", - " \"n_concurrent_trials\": 2, # Set number of concurrent trials to 2\n", - " \"use_spark\": True, # Activate Spark as parallel backend\n", - " \"force_cancel\": True, # Force cancel jobs if time limit is reached\n", + "psdf = to_pandas_on_spark(train_data)\n", + "\n", + 
"automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True,\n", "}\n", "\n", - "automl_experiment.fit(\n", + "automl.fit(\n", " dataframe=psdf,\n", - " label=label,\n", - " **automl_settings,\n", + " label=label_name,\n", + " **settings,\n", ")\n", "```\n", "\n", @@ -463,26 +549,82 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "UPDATE CONTEXT\n", + "UPDATE CONTEXT. Please provide more information or context for me to assist you better.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "index is ready to use.\n", - "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", - "Now running pipeline: [{'$vectorSearch': {'index': 
'default_index', 'limit': 60, 'numCandidates': 60, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 
0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 
0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, 
-0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 
0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", - "VectorDB returns doc_ids: [['bdfbc921']]\n", - "index is ready to use.\n", - "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 
'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", - "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 100, 'numCandidates': 100, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, 
-0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, 
-0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 
0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, 
-0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", - "VectorDB returns doc_ids: [['bdfbc921']]\n", - "index is ready to use.\n", - "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 
'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", - "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 140, 'numCandidates': 140, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 
0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 
0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, 
-0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 
0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", - "VectorDB returns doc_ids: [['bdfbc921']]\n", - "index is ready to use.\n", - "{'id': '6677781cbb83ea33c40099e1', 'name': 'default_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24, 336000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 
384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 6, 23, 1, 19, 24)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}\n", - "Now running pipeline: [{'$vectorSearch': {'index': 'default_index', 'limit': 180, 'numCandidates': 180, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 
0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, 
-0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, 
-0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 
0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$project': {'score': {'$meta': 'vectorSearchScore'}}}, {'$lookup': {'from': 'flaml_collection', 'localField': '_id', 'foreignField': '_id', 'as': 'full_document_array'}}, {'$addFields': {'full_document': {'$arrayElemAt': [{'$map': {'input': '$full_document_array', 'as': 'doc', 'in': {'id': '$$doc.id', 'content': '$$doc.content'}}}, 0]}}}, {'$project': {'full_document_array': 0, 'embedding': 0}}]\n", - "VectorDB returns doc_ids: [['bdfbc921']]\n", + "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-01 08:51:06,587 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 
'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", + "2024-07-01 08:51:21,592 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", + "2024-07-01 08:51:21,690 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 60, 'numCandidates': 600, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, 
-0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 
1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, 
-0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 
0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\n", + "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 60, 'numCandidates': 600, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, 
-0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, 
-0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 
0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, 
-0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", + "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-01 08:51:22,212 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 
'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", + "2024-07-01 08:51:37,216 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", + "2024-07-01 08:51:37,286 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 100, 'numCandidates': 1000, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, 
-0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 
0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, 
-0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 
0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\n", + "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 100, 'numCandidates': 1000, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, 
-0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, 
-0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 
0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, 
-0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", + "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-01 08:51:37,885 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 
'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", + "2024-07-01 08:51:52,889 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", + "2024-07-01 08:51:52,975 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 140, 'numCandidates': 1400, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, 
-0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 
0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, 
-0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 
0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\n", + "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 140, 'numCandidates': 1400, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, 
-0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, 
-0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 
0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, 
-0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", + "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-01 08:51:53,494 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 
'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", + "2024-07-01 08:52:08,496 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", + "2024-07-01 08:52:08,542 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 180, 'numCandidates': 1800, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, 
-0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 
0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, 
-0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 
0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\n", + "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 180, 'numCandidates': 1800, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, 
-0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, 
-0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 
0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, 
-0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", + "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n", "\u001b[32mNo more context, will terminate.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", From 74054636bd62b7969eef6b6fa85777691437959a Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 18 Jul 2024 15:48:56 -0400 Subject: [PATCH 16/33] Streamline checks for indexes in construction and restructure tests --- autogen/agentchat/contrib/vectordb/mongodb.py | 159 +++++++++--------- .../contrib/vectordb/test_mongodb.py | 145 ++++++++-------- 2 files changed, 149 insertions(+), 155 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 01a7f45f3268..126482f53a13 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -1,12 +1,10 @@ from copy import deepcopy -from time import sleep +from time import monotonic, sleep from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Set, Tuple, Union import numpy as np from pymongo import MongoClient, UpdateOne, errors from pymongo.collection import Collection -from pymongo.cursor import Cursor -from pymongo.errors import OperationFailure from 
pymongo.operations import SearchIndexModel from sentence_transformers import SentenceTransformer @@ -16,6 +14,9 @@ logger = get_logger(__name__) DEFAULT_INSERT_BATCH_SIZE = 100_000 +_SAMPLE_SENTENCE = ["The weather is lovely today in paradise."] +_TIMEOUT = 20.0 +_DELAY = 0.5 def with_id_rename(docs: Iterable) -> List[Dict[str, Any]]: @@ -35,6 +36,8 @@ def __init__( embedding_function: Callable = SentenceTransformer("all-MiniLM-L6-v2").encode, collection_name: str = None, index_name: str = "vector_index", + overwrite: bool = False, + wait_until_ready: bool = False, ): """ Initialize the vector database. @@ -43,27 +46,61 @@ def __init__( connection_string: str | The MongoDB connection string to connect to. Default is ''. database_name: str | The name of the database. Default is 'vector_db'. embedding_function: The embedding function used to generate the vector representation. + overwrite: bool | Overwrite existing collection with new information from this object + defaults to False + wait_until_ready: bool | Blocking call to wait until the database indexes are READY + will timeout after 20 seconds. 
Defaults to False """ self.embedding_function = embedding_function + self.index_name = index_name + self.overwrite = overwrite + self._wait_until_ready = wait_until_ready + + # This will get the model dimension size by computing the embeddings dimensions + self.dimensions = self._get_embedding_size() + try: self.client = MongoClient(connection_string) self.client.admin.command("ping") - logger.info("Successfully created MongoClient") + logger.debug("Successfully created MongoClient") except errors.ServerSelectionTimeoutError as err: raise ConnectionError("Could not connect to MongoDB server") from err self.db = self.client[database_name] - logger.info(f"Atlas Database name: {self.db.name}") + logger.debug(f"Atlas Database name: {self.db.name}") if collection_name: - self.active_collection = self.create_collection(collection_name) + self.active_collection = self.create_collection(collection_name, overwrite=self.overwrite) else: self.active_collection = None - # This will get the model dimension size by computing the embeddings dimensions - sentences = ["The weather is lovely today in paradise."] - embeddings = self.embedding_function(sentences) - self.dimensions = len(embeddings[0]) - # MongoDB Atlas Search Index - self.index_name = index_name + + def _is_index_ready(self, collection: Collection, index_name: str): + """Check for the index name in the list of available search indexes to see if the + specified index is of status READY + + Args: + collection (Collection): MongoDB Collection to for the search indexes + index_name (str): Vector Search Index name + + Returns: + bool : True if the index is present and READY false otherwise + """ + for index in collection.list_search_indexes(index_name): + if index["type"] == "vectorSearch" and index["status"] == "READY": + return True + return False + + def _wait_for_index(self, collection: Collection, index_name: str, timeout=_TIMEOUT): + """Waits up to 20 seconds for the index to be created to be ready, otherwise + 
throws a TimeoutError""" + start = monotonic() + while monotonic() - start < timeout: + if self._is_index_ready(collection, index_name): + return + sleep(_DELAY) + raise TimeoutError(f"Index {self.index_name} is not ready!") + + def _get_embedding_size(self): + return len(self.embedding_function(_SAMPLE_SENTENCE)[0]) def list_collections(self): """ @@ -78,70 +115,39 @@ def create_collection( self, collection_name: str, overwrite: bool = False, - get_or_create: bool = True, ) -> Collection: """ Create a collection in the vector database and create a vector search index in the collection. + If collection already exists, return the existing collection. Args: collection_name: str | The name of the collection. overwrite: bool | Whether to overwrite the collection if it exists. Default is False. - get_or_create: bool | Whether to get the collection if it exists. Default is True """ - # if overwrite is False and get_or_create is False, raise a ValueError - if not overwrite and not get_or_create: - raise ValueError("If overwrite is False, get_or_create must be True.") + collection_exists = collection_name in self.db.list_collection_names() - collection_names = self.db.list_collection_names() - if collection_name not in collection_names: + if collection_exists: # Create a new collection - coll = self.db.create_collection(collection_name) - self.create_index_if_not_exists(index_name=self.index_name, collection=coll) - return coll - if overwrite: - self.db.drop_collection(collection_name) - coll = self.db.create_collection(collection_name) - self.create_index_if_not_exists(index_name=self.index_name, collection=coll) - return coll - if get_or_create: - # The collection already exists, return it. 
coll = self.db[collection_name] - self.create_index_if_not_exists(index_name=self.index_name, collection=coll) - return coll + if overwrite: + self.db.drop_collection(collection_name) + coll = self.db.create_collection(collection_name) else: - # get_or_create is False and the collection already exists, raise an error. - raise ValueError(f"Collection {collection_name} already exists.") + coll = self.db.create_collection(collection_name) + + self.create_index_if_not_exists(index_name=self.index_name, collection=coll) + return coll - def create_index_if_not_exists(self, index_name: str = "vector_index", collection: Collection = None): + def create_index_if_not_exists(self, index_name: str = "vector_index", collection: Collection = None) -> None: """ Creates a vector search index on the specified collection in MongoDB. Args: MONGODB_INDEX (str, optional): The name of the vector search index to create. Defaults to "vector_search_index". collection (Collection, optional): The MongoDB collection to create the index on. Defaults to None. - - Returns: - bool: True if the index was successfully created, False otherwise. 
""" - success = False - # Check if the index already exists - if index_name not in collection.list_search_indexes(): - # Define retry logic with exponential backoff - retries = 3 - delay = 3 - while retries and not success: - try: - # Attempt to create the vector search index - self.create_vector_search_index(collection, index_name) - success = True - except OperationFailure: - # Handle potential operation failure - retries -= 1 - sleep(delay) - delay *= 2 # Increase delay for next retry - else: # index exists - success = True - return success + if not self._is_index_ready(collection, index_name): + self.create_vector_search_index(collection, index_name) def get_collection(self, collection_name: str = None) -> Collection: """ @@ -209,16 +215,9 @@ def create_vector_search_index( # Create the search index try: collection.create_search_index(model=search_index_model) - # Wait for the index to be created - keep_trying = True - while keep_trying: - indexes = collection.list_search_indexes() - for index in indexes: - if index["name"] == index_name and index["status"] == "READY": - keep_trying = False - else: - sleep(2) # 2s delay between checks - logger.info(f"Search index {index_name} created successfully.") + if self._wait_until_ready: + self._wait_for_index(collection, index_name) + logger.debug(f"Search index {index_name} created successfully.") except Exception as e: logger.error( f"Error creating search index: {e}. 
\n" @@ -231,7 +230,7 @@ def create_vector_search_index( def upsert_docs(self, docs, collection): for doc in docs: query = {"id": doc["id"]} - doc["embedding"] = np.array(self.embedding_function([doc["content"]])).tolist()[0] + doc["embedding"] = np.array(self.embedding_function([doc["content"]])).tolist()[0] new_values = {"$set": doc} collection.update_one(query, new_values, upsert=True) @@ -367,9 +366,12 @@ def update_docs(self, docs: List[Document], collection_name: str = None, **kwarg result = collection.bulk_write(all_updates) # Log a result summary - logger.info(f"Matched: {result.matched_count}") - logger.info(f"Modified: {result.modified_count}") - logger.info(f"Upserted: {result.upserted_count}") + logger.info( + "Matched: %s, Modified: %s, Upserted: %s", + result.matched_count, + result.modified_count, + result.upserted_count, + ) def delete_docs(self, ids: List[ItemID], collection_name: str = None, **kwargs): """ @@ -430,10 +432,12 @@ def retrieve_docs( n_results: int | The number of relevant documents to return. Default is 10. distance_threshold: float | The threshold for the distance score, only distance smaller than it will be returned. Don't filter with it if < 0. Default is -1. + wait_until_ready: bool | Will not execute the retrieval operation until the specified vector index is + ready to be queried. Defaults is false. kwargs: Dict | Additional keyword arguments. Ones of importance follow: oversampling_factor: int | This times n_results is 'ef' in the HNSW algorithm. It determines the number of nearest neighbor candidates to consider during the search phase. - A higher value leads to more accuracy, but is slower. Default = 10 + A higher value leads to more accuracy, but is slower. Default is 10 Returns: QueryResults | For each query string, a list of nearest documents and their scores. 
@@ -443,19 +447,13 @@ def retrieve_docs( if collection.count_documents({}) == 0: return [] - # Ensure that there is at least one search index - search_indexes = list(collection.list_search_indexes()) - assert len(search_indexes), f"There are no search indexes for {collection.name}" # Check status of index! - for index in search_indexes: - if index["name"] == self.index_name and index["type"] == "vectorSearch" and index["status"] != "READY": - raise Exception(f"Index {self.index_name} is not ready!") - logger.info(f"Using index: {str(list(search_indexes))}") + if self._wait_until_ready: + self._wait_for_index(collection, self.index_name) + logger.info(f"Using index: {self.index_name}") results = [] - sleep(15) for query_text in queries: # Compute embedding vector from semantic query - print('query_text', query_text) logger.info(f"Query: {query_text}") query_vector = np.array(self.embedding_function([query_text])).tolist()[0] # Find documents with similar vectors using the specified index @@ -512,7 +510,6 @@ def _vector_search( }, {"$set": {"score": {"$meta": "vectorSearchScore"}}}, ] - print("pipeline: ", pipeline) logger.info("pipeline: %s", pipeline) if distance_threshold >= 0.0: similarity_threshold = 1.0 - distance_threshold diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 0c703cb9175e..fad514b492af 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -1,6 +1,6 @@ import logging import os -from time import sleep +from time import monotonic, sleep from typing import List import pytest @@ -19,29 +19,60 @@ pytest.skip(allow_module_level=True) from pymongo.collection import Collection -from pymongo.errors import OperationFailure logger = logging.getLogger(__name__) -MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/?directConnection=true") +MONGODB_URI = os.environ.get("MONGODB_URI", 
"mongodb://localhost:64684/?directConnection=true") MONGODB_DATABASE = os.environ.get("DATABASE", "autogen_test_db") MONGODB_COLLECTION = os.environ.get("MONGODB_COLLECTION", "autogen_test_vectorstore") MONGODB_INDEX = os.environ.get("MONGODB_INDEX", "vector_index") RETRIES = 10 DELAY = 2 +TIMEOUT = 20.0 + + +def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY): + """Generic to block until the predicate returns true + + Args: + predicate (Callable[, bool]): A function that returns a boolean value + err (str): Error message to raise if nothing occurs + timeout (float, optional): Length of time to wait for predicate. Defaults to TIMEOUT. + interval (float, optional): Interval to check predicate. Defaults to DELAY. + + Raises: + TimeoutError: _description_ + """ + start = monotonic() + while not predicate(): + if monotonic() - start > TIMEOUT: + raise TimeoutError(err) + sleep(DELAY) + + +def _delete_collections(database): + """Delete all collections within the database + + Args: + database (pymongo.Database): MongoDB Database Abstraction + """ + for collection_name in database.list_collection_names(): + database[collection_name].drop() + _wait_for_predicate(lambda: not database.list_collection_names(), "Not all collections deleted") @pytest.fixture def db(): """VectorDB setup and teardown, including collections and search indexes""" - vectorstore = MongoDBAtlasVectorDB(connection_string=MONGODB_URI, database_name=MONGODB_DATABASE) - vectorstore.delete_collection(MONGODB_COLLECTION) + vectorstore = MongoDBAtlasVectorDB( + connection_string=MONGODB_URI, + database_name=MONGODB_DATABASE, + wait_until_ready=True, + overwrite=True, + ) yield vectorstore - for c in vectorstore.db.list_collection_names(): - clxn = vectorstore.get_collection(c) - clxn.drop() - sleep(20) # Provide time for resync of db and search services. 
+ _delete_collections(vectorstore.db) @pytest.fixture @@ -56,62 +87,45 @@ def example_documents() -> List[Document]: @pytest.fixture -def db_with_indexed_clxn(db): - """Convenient when we wish to de-emphasize setup. - - We provide wait and retry method when running these quick integration tests. - """ - collection = db.create_collection(MONGODB_COLLECTION) - if MONGODB_INDEX not in collection.list_search_indexes(): - retries = 3 - delay = 3 - success = False - while retries and not success: - try: - db.create_vector_search_index(collection, MONGODB_INDEX) - success = True - except OperationFailure: - retries -= 1 - sleep(delay) - return db, collection +def db_with_indexed_clxn(): + """VectorDB with a collection created immediately""" + vectorstore = MongoDBAtlasVectorDB( + connection_string=MONGODB_URI, + database_name=MONGODB_DATABASE, + wait_until_ready=True, + collection_name=MONGODB_COLLECTION, + overwrite=True, + ) + yield vectorstore, vectorstore.db[MONGODB_COLLECTION] + _delete_collections(vectorstore.db) def test_create_collection(db): """ def create_collection(collection_name: str, - overwrite: bool = False, - get_or_create: bool = True) -> Any + overwrite: bool = False) -> Collection Create a collection in the vector database. - Case 1. if the collection does not exist, create the collection. - Case 2. the collection exists, if overwrite is True, it will overwrite the collection. - - Case 3. the collection exists and overwrite is False, if get_or_create is True, it will get the collection, otherwise it raise a ValueError. + - Case 3. the collection exists and overwrite is False return the existing collection. 
""" collection_name = "test_collection" - # test_create_collection: case 1 - collection = db.create_collection( + collection_case_1 = db.create_collection( collection_name=collection_name, ) - if collection_name not in db.list_collections(): - assert collection.name == collection_name + assert collection_case_1.name == collection_name - # test_create_collection: case 2 - # test overwrite=True - collection = db.create_collection( + collection_case_2 = db.create_collection( collection_name=collection_name, overwrite=True, - get_or_create=True, ) - assert collection.name == collection_name + assert collection_case_2.name == collection_name - # test_create_collection: case 3 - # test overwrite=False - # test get_or_create=False - with pytest.raises(ValueError): - collection = db.create_collection(collection_name, overwrite=False, get_or_create=False) - # test get_or_create=True - collection = db.create_collection(collection_name, overwrite=False, get_or_create=True) - assert collection.name == collection_name + collection_case_3 = db.create_collection( + collection_name=collection_name, + ) + assert collection_case_3.name == collection_name def test_get_collection(db): @@ -149,9 +163,6 @@ def test_insert_docs(db, example_documents): # Create a collection db.delete_collection(MONGODB_COLLECTION) collection = db.create_collection(MONGODB_COLLECTION) - # Create a search index - if MONGODB_INDEX not in collection.list_search_indexes(): - db.create_vector_search_index(collection, MONGODB_INDEX) # Insert example documents db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) @@ -199,18 +210,17 @@ def test_update_docs(db_with_indexed_clxn, example_documents): def test_delete_docs(db_with_indexed_clxn, example_documents): - db, collection = db_with_indexed_clxn + db, clxn = db_with_indexed_clxn # Insert example documents db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) # Delete the 1s db.delete_docs(ids=[1, "1"], 
collection_name=MONGODB_COLLECTION) # Confirm just the 2s remain - clxn = db.get_collection(MONGODB_COLLECTION) assert {2, "2"} == {doc["_id"] for doc in clxn.find({})} def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): - db, collection = db_with_indexed_clxn + db, _ = db_with_indexed_clxn # Insert example documents db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) @@ -230,38 +240,30 @@ def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): assert len(docs) == 0 -def test_retrieve_docs(db, example_documents): - # Create collection - db.delete_collection(MONGODB_COLLECTION) - collection = db.get_collection(MONGODB_COLLECTION) +def test_retrieve_docs(db_with_indexed_clxn, example_documents): + db, _ = db_with_indexed_clxn # Sanity test. Retrieving docs before documents have been added - results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) - assert results == [] + assert db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) == [] # Insert example documents db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) - # Sanity test. Retrieving docs before the search index had been created - db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) - # Create the index - db.create_vector_search_index(collection=collection, index_name=MONGODB_INDEX) - # Begin testing Atlas Vector Search # NOTE: Indexing may take some time, so we must be patient on the first query. 
+ # We have the wait_until_ready flag to ensure index is created and ready # Immediately adding documents and then querying is only standard for testing n_results = 2 # Number of closest docs to return success = False - retries = RETRIES - while retries and not success: + start = monotonic() + while monotonic() - start < TIMEOUT: results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) if len(results[0]) == n_results: success = True + break else: - retries -= 1 sleep(DELAY) - if not success: - raise OperationFailure(f"Failed to retrieve docs after {RETRIES} retries, waiting {DELAY} seconds after each.") + assert success, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each." assert {doc[0]["id"] for doc in results[0]} == {1, 2} @@ -276,8 +278,3 @@ def test_retrieve_docs(db, example_documents): assert all([len(res) == n_results for res in results]) assert {doc[0]["id"] for doc in results[0]} == {1, 2} assert {doc[0]["id"] for doc in results[1]} == {"1", "2"} - - -def test_search_indexes(db): - pass - # TODO From 7d778fe1253cd4607b882688f40521720645e339 Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 18 Jul 2024 17:09:26 -0400 Subject: [PATCH 17/33] Add tests for score_threshold, embedding inclusion, and multiple query tests --- autogen/agentchat/contrib/vectordb/mongodb.py | 17 ++- .../contrib/vectordb/test_mongodb.py | 103 ++++++++++++++---- 2 files changed, 96 insertions(+), 24 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 126482f53a13..f6ccbe458cce 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -1,10 +1,12 @@ from copy import deepcopy +from importlib.metadata import version from time import monotonic, sleep from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Set, Tuple, Union import numpy as np from pymongo import MongoClient, UpdateOne, 
errors from pymongo.collection import Collection +from pymongo.driver_info import DriverInfo from pymongo.operations import SearchIndexModel from sentence_transformers import SentenceTransformer @@ -60,7 +62,7 @@ def __init__( self.dimensions = self._get_embedding_size() try: - self.client = MongoClient(connection_string) + self.client = MongoClient(connection_string, driver=DriverInfo(name="autogen")) self.client.admin.command("ping") logger.debug("Successfully created MongoClient") except errors.ServerSelectionTimeoutError as err: @@ -463,7 +465,8 @@ def retrieve_docs( collection, self.index_name, distance_threshold, - kwargs.get("oversampling_factor", 10), + **kwargs, + oversampling_factor=kwargs.get("oversampling_factor", 10), ) # Change each _id key to id. with_id_rename, but with (doc, score) tuples results.append( @@ -479,6 +482,7 @@ def _vector_search( index_name: str, distance_threshold: float = -1.0, oversampling_factor=10, + include_embedding=False, ) -> List[Tuple[Dict, float]]: """Core $vectorSearch Aggregation pipeline. @@ -488,7 +492,7 @@ def _vector_search( collection: MongoDB Collection with vector index index_name: Name of the vector index distance_threshold: Only distance measures smaller than this will be returned. - Don't filter with it if < 0. Default is -1. + Don't filter with it if 1 < x < 0. Default is -1. oversampling_factor: int | This times n_results is 'ef' in the HNSW algorithm. It determines the number of nearest neighbor candidates to consider during the search phase. A higher value leads to more accuracy, but is slower. 
Default = 10 @@ -510,10 +514,13 @@ def _vector_search( }, {"$set": {"score": {"$meta": "vectorSearchScore"}}}, ] - logger.info("pipeline: %s", pipeline) if distance_threshold >= 0.0: similarity_threshold = 1.0 - distance_threshold - pipeline.append({"$match": {"score": {"gte": similarity_threshold}}}) + pipeline.append({"$match": {"score": {"$gte": similarity_threshold}}}) + + if not include_embedding: + pipeline.append({"$project": {"embedding": 0}}) + logger.info("pipeline: %s", pipeline) agg = collection.aggregate(pipeline) return [(doc, doc.pop("score")) for doc in agg] diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index fad514b492af..14d73ff33c70 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -240,41 +240,106 @@ def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): assert len(docs) == 0 -def test_retrieve_docs(db_with_indexed_clxn, example_documents): +def test_retrieve_docs_empty(db_with_indexed_clxn): db, _ = db_with_indexed_clxn - # Sanity test. Retrieving docs before documents have been added assert db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) == [] + + +def test_retrieve_docs_populated_db_empty_query(db_with_indexed_clxn, example_documents): + db, _ = db_with_indexed_clxn + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + # Empty list of queries returns empty list of results + results = db.retrieve_docs(queries=[], collection_name=MONGODB_COLLECTION, n_results=2) + assert results == [] + + +def test_retrieve_docs(db_with_indexed_clxn, example_documents): + """Begin testing Atlas Vector Search + NOTE: Indexing may take some time, so we must be patient on the first query. 
+ We have the wait_until_ready flag to ensure index is created and ready + Immediately adding documents and then querying is only standard for testing + """ + db, _ = db_with_indexed_clxn # Insert example documents db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) - # Begin testing Atlas Vector Search - # NOTE: Indexing may take some time, so we must be patient on the first query. - # We have the wait_until_ready flag to ensure index is created and ready - # Immediately adding documents and then querying is only standard for testing + n_results = 2 # Number of closest docs to return + + def results_ready(): + results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) + return len(results[0]) == n_results + + _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") + + results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) + assert {doc[0]["id"] for doc in results[0]} == {1, 2} + + +def test_retrieve_docs_with_embedding(db_with_indexed_clxn, example_documents): + """Begin testing Atlas Vector Search + NOTE: Indexing may take some time, so we must be patient on the first query. + We have the wait_until_ready flag to ensure index is created and ready + Immediately adding documents and then querying is only standard for testing + """ + db, _ = db_with_indexed_clxn + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) n_results = 2 # Number of closest docs to return - success = False - start = monotonic() - while monotonic() - start < TIMEOUT: + def results_ready(): results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) - if len(results[0]) == n_results: - success = True - break - else: - sleep(DELAY) - assert success, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each." 
+ return len(results[0]) == n_results + _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") + + results = db.retrieve_docs( + queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results, include_embedding=True + ) assert {doc[0]["id"] for doc in results[0]} == {1, 2} + assert all(["embedding" in doc[0] for doc in results[0]]) - # Empty list of queries returns empty list of results - results = db.retrieve_docs(queries=[], collection_name=MONGODB_COLLECTION, n_results=n_results) - assert results == [] - # Empty list of queries returns empty list of results +def test_retrieve_docs_multiple_queries(db_with_indexed_clxn, example_documents): + db, _ = db_with_indexed_clxn + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + n_results = 2 # Number of closest docs to return + queries = ["Some good pets", "What kind of Sandwich?"] + + def results_ready(): + results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=n_results) + return all([len(res) == n_results for res in results]) + + _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") + results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=2) + assert len(results) == len(queries) assert all([len(res) == n_results for res in results]) assert {doc[0]["id"] for doc in results[0]} == {1, 2} assert {doc[0]["id"] for doc in results[1]} == {"1", "2"} + + +def test_retrieve_docs_with_threshold(db_with_indexed_clxn, example_documents): + db, _ = db_with_indexed_clxn + # Insert example documents + db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + + n_results = 2 # Number of closest docs to return + queries = ["Cats"] + + def results_ready(): + results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=n_results) + return len(results[0]) == 
n_results + + _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") + + # Distance Threshold of .3 means that the score must be .7 or greater + # only one result should be that value + results = db.retrieve_docs( + queries=queries, collection_name=MONGODB_COLLECTION, n_results=n_results, distance_threshold=0.3 + ) + assert len(results[0]) == 1 + assert all([doc[1] >= 0.7 for doc in results[0]]) From 0921c532a530cf900dfd5a5ac0d1e3212eb55874 Mon Sep 17 00:00:00 2001 From: Jib Date: Fri, 19 Jul 2024 10:36:26 -0400 Subject: [PATCH 18/33] refactored create_collection to meet base object requirements --- autogen/agentchat/contrib/vectordb/base.py | 2 -- autogen/agentchat/contrib/vectordb/mongodb.py | 26 +++++++++++-------- .../contrib/vectordb/test_mongodb.py | 8 ++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/base.py b/autogen/agentchat/contrib/vectordb/base.py index d0bb0f9351ba..d7d49d6200ca 100644 --- a/autogen/agentchat/contrib/vectordb/base.py +++ b/autogen/agentchat/contrib/vectordb/base.py @@ -201,9 +201,7 @@ class VectorDBFactory: Factory class for creating vector databases. 
""" - PREDEFINED_VECTOR_DB = ["chroma", "pgvector", "mongodb", "qdrant"] - PREDEFINED_VECTOR_DB = ["chroma", "pgvector", "qdrant"] @staticmethod def create_vector_db(db_type: str, **kwargs) -> VectorDB: diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index f6ccbe458cce..728f99827768 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -1,5 +1,4 @@ from copy import deepcopy -from importlib.metadata import version from time import monotonic, sleep from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Set, Tuple, Union @@ -117,28 +116,33 @@ def create_collection( self, collection_name: str, overwrite: bool = False, + get_or_create: bool = True, ) -> Collection: """ Create a collection in the vector database and create a vector search index in the collection. - If collection already exists, return the existing collection. Args: collection_name: str | The name of the collection. overwrite: bool | Whether to overwrite the collection if it exists. Default is False. + get_or_create: bool | Whether to get or create the collection. Default is True """ - collection_exists = collection_name in self.db.list_collection_names() + if overwrite: + self.db.drop_collection(collection_name) - if collection_exists: + if collection_name not in self.db.list_collection_names(): # Create a new collection - coll = self.db[collection_name] - if overwrite: - self.db.drop_collection(collection_name) - coll = self.db.create_collection(collection_name) - else: coll = self.db.create_collection(collection_name) + self.create_index_if_not_exists(index_name=self.index_name, collection=coll) + return coll - self.create_index_if_not_exists(index_name=self.index_name, collection=coll) - return coll + if get_or_create: + # The collection already exists, return it. 
+ coll = self.db[collection_name] + self.create_index_if_not_exists(index_name=self.index_name, collection=coll) + return coll + else: + # get_or_create is False and the collection already exists, raise an error. + raise ValueError(f"Collection {collection_name} already exists.") def create_index_if_not_exists(self, index_name: str = "vector_index", collection: Collection = None) -> None: """ diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 14d73ff33c70..c80b46fab85f 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -108,6 +108,7 @@ def create_collection(collection_name: str, - Case 1. if the collection does not exist, create the collection. - Case 2. the collection exists, if overwrite is True, it will overwrite the collection. - Case 3. the collection exists and overwrite is False return the existing collection. + - Case 4. the collection exists and overwrite is False and get_or_create is False, raise a ValueError """ collection_name = "test_collection" @@ -127,6 +128,13 @@ def create_collection(collection_name: str, ) assert collection_case_3.name == collection_name + with pytest.raises(ValueError): + db.create_collection( + collection_name=collection_name, + overwrite=False, + get_or_create=False + ) + def test_get_collection(db): collection_name = MONGODB_COLLECTION From 01f96c7e61cf04031c894ac77971ace0661ad6a9 Mon Sep 17 00:00:00 2001 From: Jib Date: Fri, 19 Jul 2024 10:58:32 -0400 Subject: [PATCH 19/33] lint --- notebook/agentchat_mongodb_RetrieveChat.ipynb | 2 +- test/agentchat/contrib/vectordb/test_mongodb.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/notebook/agentchat_mongodb_RetrieveChat.ipynb b/notebook/agentchat_mongodb_RetrieveChat.ipynb index f1e696fe6e98..3107df3a76f1 100644 --- a/notebook/agentchat_mongodb_RetrieveChat.ipynb +++ b/notebook/agentchat_mongodb_RetrieveChat.ipynb @@ 
-180,7 +180,7 @@ " \"db_config\": {\n", " \"connection_string\": \"\", # MongoDB Atlas connection string\n", " \"database_name\": \"\", # MongoDB Atlas database\n", - " \"index_name\":\"vector_index\",\n", + " \"index_name\": \"vector_index\",\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index c80b46fab85f..d16c33a9a347 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -129,11 +129,7 @@ def create_collection(collection_name: str, assert collection_case_3.name == collection_name with pytest.raises(ValueError): - db.create_collection( - collection_name=collection_name, - overwrite=False, - get_or_create=False - ) + db.create_collection(collection_name=collection_name, overwrite=False, get_or_create=False) def test_get_collection(db): From 311259e9af2c2a2a08442da325a487df100f129e Mon Sep 17 00:00:00 2001 From: Jib Date: Fri, 19 Jul 2024 11:04:31 -0400 Subject: [PATCH 20/33] change the localhost port to 27017 --- test/agentchat/contrib/vectordb/test_mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index d16c33a9a347..c81e664455ee 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) -MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:64684/?directConnection=true") +MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/?directConnection=true") MONGODB_DATABASE = os.environ.get("DATABASE", "autogen_test_db") MONGODB_COLLECTION = os.environ.get("MONGODB_COLLECTION", 
"autogen_test_vectorstore") MONGODB_INDEX = os.environ.get("MONGODB_INDEX", "vector_index") From cf97466058cf6094839554100b3c9c6a45c04d69 Mon Sep 17 00:00:00 2001 From: Jib Date: Fri, 19 Jul 2024 11:06:54 -0400 Subject: [PATCH 21/33] add test to check that no embedding is there unless explicitly provided --- test/agentchat/contrib/vectordb/test_mongodb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index c81e664455ee..1588808191f0 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -277,6 +277,7 @@ def results_ready(): results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) assert {doc[0]["id"] for doc in results[0]} == {1, 2} + assert all(["embedding" not in doc[0] for doc in results[0]]) def test_retrieve_docs_with_embedding(db_with_indexed_clxn, example_documents): From 8491d5aa71ea37594aeb49ccf177d5a9f1a62e88 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 21 Jul 2024 21:43:07 +0800 Subject: [PATCH 22/33] Update logger --- autogen/agentchat/contrib/vectordb/mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 728f99827768..c45653fd729a 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -525,6 +525,6 @@ def _vector_search( if not include_embedding: pipeline.append({"$project": {"embedding": 0}}) - logger.info("pipeline: %s", pipeline) + logger.debug("pipeline: %s", pipeline) agg = collection.aggregate(pipeline) return [(doc, doc.pop("score")) for doc in agg] From 1b41e18d182a32d63a16384ff45477e0e93e24e2 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 21 Jul 2024 21:43:54 +0800 Subject: [PATCH 23/33] Add test get docs with ids=None --- 
test/agentchat/contrib/vectordb/test_mongodb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 1588808191f0..4b7b222327b3 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -243,6 +243,10 @@ def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): docs = db.get_docs_by_ids(ids=[], include=["content"], collection_name=MONGODB_COLLECTION) assert len(docs) == 0 + # Test with ids=None + docs = db.get_docs_by_ids(ids=None, include=["content"], collection_name=MONGODB_COLLECTION) + assert len(docs) == 4 + def test_retrieve_docs_empty(db_with_indexed_clxn): db, _ = db_with_indexed_clxn From 14776e4f5ccaad00b056a2861945fef0b4df56b7 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 21 Jul 2024 22:01:46 +0800 Subject: [PATCH 24/33] Rename and update notebook --- notebook/agentchat_RetrieveChat_mongodb.ipynb | 269 +++++++ notebook/agentchat_mongodb_RetrieveChat.ipynb | 680 ------------------ 2 files changed, 269 insertions(+), 680 deletions(-) create mode 100644 notebook/agentchat_RetrieveChat_mongodb.ipynb delete mode 100644 notebook/agentchat_mongodb_RetrieveChat.ipynb diff --git a/notebook/agentchat_RetrieveChat_mongodb.ipynb b/notebook/agentchat_RetrieveChat_mongodb.ipynb new file mode 100644 index 000000000000..9c128bffbeab --- /dev/null +++ b/notebook/agentchat_RetrieveChat_mongodb.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using RetrieveChat Powered by MongoDB Atlas for Retrieve Augmented Code Generation and Question Answering\n", + "\n", + "AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat.
This framework allows tool use and human participation through multi-agent conversation.\n", + "Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n", + "\n", + "RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on custom documentation that is not present in the LLM's training dataset. RetrieveChat uses the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n", + "\n", + "## Table of Contents\n", + "We'll demonstrate one example of using RetrieveChat for code generation and question answering:\n", + "\n", + "- [Example 1: Generate code based off docstrings w/o human feedback](#example-1)\n", + "\n", + "````{=mdx}\n", + ":::info Requirements\n", + "Some extra dependencies are needed for this notebook, which can be installed via pip:\n", + "\n", + "```bash\n", + "pip install pyautogen[retrievechat-mongodb] flaml[automl]\n", + "```\n", + "\n", + "For more information, please refer to the [installation guide](/docs/installation/).\n", + ":::\n", + "````\n", + "\n", + "Ensure you have a MongoDB Atlas instance with Cluster Tier >= M30."
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint\n", + "\n", + "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "models to use: ['gpt4-1106-preview', 'gpt-4o', 'gpt-35-turbo', 'gpt-35-turbo-0613']\n" + ] + } + ], + "source": [ + "import json\n", + "import os\n", + "\n", + "import autogen\n", + "from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n", + "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", + "\n", + "# Accepted file formats that can be stored in\n", + "# a vector database instance\n", + "from autogen.retrieve_utils import TEXT_FORMATS\n", + "\n", + "config_list = autogen.config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\", file_location=\".\")\n", + "assert len(config_list) > 0\n", + "print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "````{=mdx}\n", + ":::tip\n", + "Learn more about configuring LLMs for agents [here](/docs/topics/llm_configuration).\n", + ":::\n", + "````\n", + "\n", + "## Construct agents for RetrieveChat\n", + "\n", + "We start by initializing the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for RetrieveAssistantAgent. The detailed instructions are given in the user message.
Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval-augmented generation task for an initial prompt to be sent to the LLM assistant." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accepted file formats for `docs_path`:\n", + "['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n" + ] + } + ], + "source": [ + "print(\"Accepted file formats for `docs_path`:\")\n", + "print(TEXT_FORMATS)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lijiang1/anaconda3/envs/pr2942/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from tqdm.autonotebook import tqdm, trange\n" + ] + } + ], + "source": [ + "# 1. create a RetrieveAssistantAgent instance named \"assistant\"\n", + "assistant = RetrieveAssistantAgent(\n", + " name=\"assistant\",\n", + " system_message=\"You are a helpful assistant.\",\n", + " llm_config={\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 42,\n", + " \"config_list\": config_list,\n", + " },\n", + ")\n", + "\n", + "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", + "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n", + "# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n", + "# it is set to None, which works only if the collection is already created.\n", + "# `task` indicates the kind of task we're working on.
In this example, it's a `code` task.\n", + "# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n", + "# `custom_text_types` is a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.\n", + "# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n", + "# In this example, we set it to [\"non-existent-type\"] to only process markdown files. Since no \"non-existent-type\" files are included in the `website/docs`,\n", + "# no files there will be processed. However, the explicitly included urls will still be processed.\n", + "ragproxyagent = RetrieveUserProxyAgent(\n", + " name=\"ragproxyagent\",\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=3,\n", + " retrieve_config={\n", + " \"task\": \"code\",\n", + " \"docs_path\": [\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n", + " os.path.join(os.path.abspath(\"\"), \"..\", \"website\", \"docs\"),\n", + " ],\n", + " \"custom_text_types\": [\"non-existent-type\"],\n", + " \"chunk_token_size\": 2000,\n", + " \"model\": config_list[0][\"model\"],\n", + " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", + " \"collection_name\": \"demo_collection\",\n", + " \"db_config\": {\n", + " \"connection_string\": \"\", # MongoDB Atlas connection string\n", + " \"database_name\": \"test_db\", # MongoDB Atlas database\n", + " \"index_name\": \"vector_index\",\n", + " },\n", + " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", + " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", + " },\n", + " code_execution_config=False, # set to False if you don't want to execute
the code\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 1\n", + "\n", + "[Back to top](#table-of-contents)\n", + "\n", + "Use RetrieveChat to help generate sample code and automatically run the code and fix errors if there are any.\n", + "\n", + "Problem: Which API should I use if I want to use FLAML for a classification task and I want to train the model in 30 seconds. Use Spark to parallelize the training. Force cancel jobs if the time limit is reached." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to create collection.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-21 21:50:16,476 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-07-21 21:50:16,532 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: vector_index\u001b[0m\n", + "2024-07-21 21:50:16,533 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VectorDB returns doc_ids: [[]]\n", + "\u001b[32mNo more context, will terminate.\u001b[0m\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "# reset the assistant.
Always reset the assistant before starting a new conversation.\n", + "assistant.reset()\n", + "\n", + "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", + "# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.\n", + "# The conversation continues until the termination condition is met. In RetrieveChat, without human-in-loop, the termination condition is that no code block is detected.\n", + "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", + "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\"\n", + "chat_result = ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem=code_problem)" + ] + } + ], + "metadata": { + "front_matter": { + "description": "Explore the use of AutoGen's RetrieveChat for tasks like code generation from docstrings, answering complex questions with human feedback, and exploiting features like Update Context, custom prompts, and few-shot learning.", + "tags": [ + "RAG" + ] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "skip_test": "Requires interactive usage" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebook/agentchat_mongodb_RetrieveChat.ipynb b/notebook/agentchat_mongodb_RetrieveChat.ipynb deleted file mode 100644 index 3107df3a76f1..000000000000 --- a/notebook/agentchat_mongodb_RetrieveChat.ipynb +++ /dev/null @@ -1,680 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, -
"cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using RetrieveChat Powered by MongoDB Atlas for Retrieve Augmented Code Generation and Question Answering\n", - "\n", - "AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation through multi-agent conversation.\n", - "Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n", - "\n", - "RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). 
Essentially, `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n", - "\n", - "## Table of Contents\n", - "We'll demonstrate six examples of using RetrieveChat for code generation and question answering:\n", - "\n", - "- [Example 1: Generate code based off docstrings w/o human feedback](#example-1)\n", - "\n", - "````{=mdx}\n", - ":::info Requirements\n", - "Some extra dependencies are needed for this notebook, which can be installed via pip:\n", - "\n", - "```bash\n", - "pip install pyautogen[retrievechat-mongodb] flaml[automl]\n", - "```\n", - "\n", - "For more information, please refer to the [installation guide](/docs/installation/).\n", - ":::\n", - "````\n", - "\n", - "Ensure you have a MongoDB Atlas instance.\n", - "\n", - "If not, a test version can quickly be deployed using Docker.\n", - "\n", - "`docker-compose.yml`\n", - "\n", - "```yml\n", - "version: '3.9'\n", - "\n", - "services:\n", - " mongodb:\n", - " image: mongodb/mongodb-atlas-local:latest\n", - " restart: unless-stopped\n", - " ports:\n", - " - \"27017:27017\"\n", - " environment:\n", - " MONGODB_INITDB_ROOT_USERNAME: mongodb_user\n", - " MONGODB_INITDB_ROOT_PASSWORD: mongodb_password\n", - "```\n", - "\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set your API Endpoint\n", - "\n", - "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "models to use: ['gpt-35-turbo']\n" - ] - } - ], - "source": [ - "import json\n", - "import os\n", - "\n", - "import autogen\n", - "from autogen.agentchat.contrib.retrieve_assistant_agent import 
RetrieveAssistantAgent\n", - "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", - "\n", - "# Accepted file formats for that can be stored in\n", - "# a vector database instance\n", - "from autogen.retrieve_utils import TEXT_FORMATS\n", - "\n", - "config_list = [\n", - " {\n", - " \"model\": \"gpt-35-turbo\",\n", - " \"base_url\": \"\",\n", - " \"api_type\": \"azure\",\n", - " \"api_version\": \"2023-07-01-preview\",\n", - " \"api_key\": \"\",\n", - " },\n", - "]\n", - "assert len(config_list) > 0\n", - "print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "````{=mdx}\n", - ":::tip\n", - "Learn more about configuring LLMs for agents [here](/docs/topics/llm_configuration).\n", - ":::\n", - "````\n", - "\n", - "## Construct agents for RetrieveChat\n", - "\n", - "We start by initializing the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for RetrieveAssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accepted file formats for `docs_path`:\n", - "['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n" - ] - } - ], - "source": [ - "print(\"Accepted file formats for `docs_path`:\")\n", - "print(TEXT_FORMATS)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 1. 
create an RetrieveAssistantAgent instance named \"assistant\"\n", - "assistant = RetrieveAssistantAgent(\n", - " name=\"assistant\",\n", - " system_message=\"You are a helpful assistant.\",\n", - " llm_config={\n", - " \"timeout\": 600,\n", - " \"cache_seed\": 42,\n", - " \"config_list\": config_list,\n", - " },\n", - ")\n", - "\n", - "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", - "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n", - "# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n", - "# it is set to None, which works only if the collection is already created.\n", - "# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n", - "# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n", - "# `custom_text_types` is a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.\n", - "# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n", - "# In this example, we set it to [\"non-existent-type\"] to only process markdown files. Since no \"non-existent-type\" files are included in the `websit/docs`,\n", - "# no files there will be processed. 
However, the explicitly included urls will still be processed.\n", - "ragproxyagent = RetrieveUserProxyAgent(\n", - " name=\"ragproxyagent\",\n", - " human_input_mode=\"NEVER\",\n", - " max_consecutive_auto_reply=3,\n", - " retrieve_config={\n", - " \"task\": \"code\",\n", - " \"docs_path\": [\n", - " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n", - " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n", - " os.path.join(os.path.abspath(\"\"), \"..\", \"website\", \"docs\"),\n", - " ],\n", - " \"custom_text_types\": [\"non-existent-type\"],\n", - " \"chunk_token_size\": 2000,\n", - " \"model\": config_list[0][\"model\"],\n", - " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", - " \"collection_name\": \"demo_collection\",\n", - " \"db_config\": {\n", - " \"connection_string\": \"\", # MongoDB Atlas connection string\n", - " \"database_name\": \"\", # MongoDB Atlas database\n", - " \"index_name\": \"vector_index\",\n", - " },\n", - " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", - " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", - " },\n", - " code_execution_config=False, # set to False if you don't want to execute the code\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example 1\n", - "\n", - "[Back to top](#table-of-contents)\n", - "\n", - "Use RetrieveChat to help generate sample code and automatically run the code and fix errors if there is any.\n", - "\n", - "Problem: Which API should I use if I want to use FLAML for a classification task and I want to train the model in 30 seconds. Use spark to parallel the training. Force cancel jobs if time limit is reached." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trying to create collection.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-01 08:50:43,934 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Search index vector_index created successfully.\u001b[0m\n", - "2024-07-01 08:50:44,612 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "2024-07-01 08:50:45,064 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", - "2024-07-01 08:51:00,069 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", - "2024-07-01 08:51:00,164 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 20, 'numCandidates': 200, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 
0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 
0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, 
-0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 
0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", - "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 20, 'numCandidates': 200, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, 
-0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, 
-0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 
0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 
0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", - "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n", - "\u001b[32mAdding content of doc 6682a624b9758026836ffab5 to context.\u001b[0m\n", - "\u001b[32mAdding content of doc 6682a624b9758026836ffad5 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m 
(to assistant):\n", - "\n", - "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "For code generation, you must obey the following rules:\n", - "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", - "Rule 2. You must follow the formats below to write your code:\n", - "```language\n", - "# your code\n", - "```\n", - "\n", - "User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", - "\n", - "Context is: # Integrate - Spark\n", - "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", - "\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", - "\n", - "## Spark ML Estimators\n", - "\n", - "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "### Data\n", - "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", - "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. 
If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", - "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", - "\n", - "Here is an example code snippet for Spark Data:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "\n", - "# Creating a dictionary\n", - "data = {\n", - " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000],\n", - "}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. 
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", - "\n", - "Here is an example of how to use it:\n", - "\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", - "\n", - "### Estimators\n", - "\n", - "#### Model List\n", - "\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", - "\n", - "#### Usage\n", - "\n", - "First, prepare your data in the required format as described in the previous section.\n", - "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", - "\n", - "Here is an example code snippet using SparkML models in AutoML:\n", - "\n", - "```python\n", - "import flaml\n", - "\n", - "# prepare your data in pandas-on-spark format as we previously mentioned\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "## Parallel Spark Jobs\n", - "\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", - "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", - "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. 
This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "\n", - "```python\n", - "import flaml\n", - "\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "# Research\n", - "\n", - "For technical details, please check our research publications.\n", - "\n", - "- [FLAML: A Fast and Lightweight AutoML 
Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021flaml,\n", - " title={FLAML: A Fast and Lightweight AutoML Library},\n", - " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", - " year={2021},\n", - " booktitle={MLSys},\n", - "}\n", - "```\n", - "\n", - "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021cfo,\n", - " title={Frugal Optimization for Cost-related Hyperparameters},\n", - " author={Qingyun Wu and Chi Wang and Silu Huang},\n", - " year={2021},\n", - " booktitle={AAAI},\n", - "}\n", - "```\n", - "\n", - "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021blendsearch,\n", - " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", - " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", - " year={2021},\n", - " booktitle={ICLR},\n", - "}\n", - "```\n", - "\n", - "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. 
ACL 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{liuwang2021hpolm,\n", - " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", - " author={Susan Xueqing Liu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ACL},\n", - "}\n", - "```\n", - "\n", - "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021chacha,\n", - " title={ChaCha for Online AutoML},\n", - " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", - " year={2021},\n", - " booktitle={ICML},\n", - "}\n", - "```\n", - "\n", - "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", - "\n", - "```bibtex\n", - "@inproceedings{wuwang2021fairautoml,\n", - " title={Fair AutoML},\n", - " author={Qingyun Wu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ArXiv preprint arXiv:2111.06495},\n", - "}\n", - "```\n", - "\n", - "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", - "\n", - "```bibtex\n", - "@inproceedings{kayaliwang2022default,\n", - " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", - " author={Moe Kayali and Chi Wang},\n", - " year={2022},\n", - " booktitle={ArXiv preprint arXiv:2202.09927},\n", - "}\n", - "```\n", - "\n", - "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. 
ICLR 2023 (notable-top-5%).\n", - "\n", - "```bibtex\n", - "@inproceedings{zhang2023targeted,\n", - " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", - " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", - " booktitle={International Conference on Learning Representations},\n", - " year={2023},\n", - " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", - "}\n", - "```\n", - "\n", - "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2023EcoOptiGen,\n", - " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", - " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2303.04673},\n", - "}\n", - "```\n", - "\n", - "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2023empirical,\n", - " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", - " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2306.01337},\n", - "}\n", - "```\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "You can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using joblib-spark. According to the documentation, to use FLAML with Spark, you need to prepare your data in pandas-on-spark format using the `flaml.automl.spark.utils.to_pandas_on_spark` function in the `flaml.automl.spark.utils` module. Then, you can pass pandas-on-spark data to FLAML as normal data using `dataframe` and `label`. For example, to use SparkML models for regression and train for 30 seconds with force cancel, you can use the following code snippet:\n", - "\n", - "```python\n", - "import flaml\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "\n", - "# load your data into a pandas dataframe\n", - "train_data = ...\n", - "\n", - "psdf = to_pandas_on_spark(train_data)\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True,\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label_name,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "UPDATE CONTEXT. 
Please provide more information or context for me to assist you better.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-01 08:51:06,587 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", - "2024-07-01 08:51:21,592 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", - "2024-07-01 08:51:21,690 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 60, 'numCandidates': 600, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 
0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 
0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, 
-0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 
0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", - "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 60, 'numCandidates': 600, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, 
-0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, 
-0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 
0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 
0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", - "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-01 08:51:22,212 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': '6682a6042cf0e270602c0fe1', 
'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", - "2024-07-01 08:51:37,216 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", - "2024-07-01 08:51:37,286 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 100, 'numCandidates': 1000, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, 
-0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, 
-0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 
0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, 
-0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", - "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 100, 'numCandidates': 1000, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, 
-0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 
0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, 
-0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 
0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", - "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-01 08:51:37,885 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': 
'6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", - "2024-07-01 08:51:52,889 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", - "2024-07-01 08:51:52,975 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 140, 'numCandidates': 1400, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, 
-0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, 
-0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 
0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, 
-0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", - "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 140, 'numCandidates': 1400, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, 
-0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 
0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, 
-0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 
0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", - "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-01 08:51:53,494 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: [{'id': 
'6682a6042cf0e270602c0fe1', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12, 109000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'numDimensions': 384, 'path': 'embedding', 'similarity': 'cosine'}]}, 'statusDetail': [{'hostname': 'shared-shard-00-search-6xag8e', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}, {'hostname': 'shared-shard-00-search-onamml', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2024, 7, 1, 12, 50, 12)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}}}]}]\u001b[0m\n", - "2024-07-01 08:52:08,496 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n", - "2024-07-01 08:52:08,542 - autogen.agentchat.contrib.vectordb.mongodb - INFO - pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 180, 'numCandidates': 1800, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, -0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, 
-0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, 
-0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, -0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 
0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, 
-0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query_text How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", - "pipeline: [{'$vectorSearch': {'index': 'vector_index', 'limit': 180, 'numCandidates': 1800, 'queryVector': [-0.08256451040506363, -0.07900252193212509, -0.05290786176919937, 0.021982736885547638, 0.046406690031290054, 0.027769701555371284, -0.02768588438630104, -0.020102187991142273, -0.05407266318798065, -0.061684805899858475, -0.03940979018807411, -0.029285598546266556, -0.1118478998541832, -0.03136416897177696, -0.04099257290363312, -0.07897000014781952, -0.02522769570350647, 0.043702732771635056, -0.030820483341813087, -0.041595760732889175, 0.10552595555782318, 0.0023172772489488125, 0.08983399718999863, 0.10865391790866852, -0.06146957352757454, 0.04154617711901665, 0.015428234823048115, 0.016568025574088097, 0.013623313046991825, -0.06059451401233673, 0.08428270369768143, 0.009563339874148369, -0.002620439976453781, 0.016997039318084717, -0.07201018929481506, -0.010901586152613163, -0.030768705531954765, 
-0.04398634657263756, -0.026716720312833786, -0.019298473373055458, 0.029043301939964294, -0.03137688338756561, -0.0516120120882988, -0.033414166420698166, 0.05385608226060867, -0.025596346706151962, -0.02077491395175457, -0.0634346529841423, 0.03223349153995514, 0.02784794755280018, -0.06079091876745224, -0.012161108665168285, -0.0933445394039154, -0.018985357135534286, -0.022000310942530632, 0.08059032261371613, 0.03905639797449112, 0.008981743827462196, -0.04856802150607109, -0.0195226538926363, -0.016003113240003586, -0.10165907442569733, -0.004733760375529528, 0.030122995376586914, -0.038355227559804916, 0.03839924931526184, -0.028533125296235085, 0.01822500303387642, 0.0707336813211441, -0.02592848241329193, 0.02241864986717701, 0.022557010874152184, 0.007257631979882717, 0.03511698544025421, 0.008497730828821659, 0.06233576685190201, 0.06869452446699142, 0.06520985811948776, -0.018009020015597343, 0.008016299456357956, -0.09440284222364426, -0.06914988905191422, -0.016991959884762764, -0.004849573597311974, 0.015289856120944023, -0.05368100106716156, -0.07648778706789017, 0.04355047643184662, -0.013986689038574696, 0.03536888584494591, 0.03178128972649574, 0.03904074802994728, 0.027542345225811005, 0.021311746910214424, -0.08981165289878845, 0.050620175898075104, 0.006543598137795925, 0.07310184836387634, -0.033499374985694885, -0.01851048693060875, -0.07171830534934998, -0.07049573212862015, -0.02946554869413376, 0.04081925004720688, -0.015752671286463737, -0.05440584942698479, -0.00638421019539237, -0.027693038806319237, -0.015809008851647377, -0.0794110968708992, 0.08307767659425735, -0.010127314366400242, 0.031197702512145042, -0.0325561985373497, 0.028586456552147865, 0.05326930806040764, -0.04397851228713989, -0.06359461694955826, 0.003676487598568201, 0.06998850405216217, -0.02999182790517807, 0.03461074084043503, 0.05651488155126572, -0.05784572660923004, 0.02231559529900551, -0.07732831686735153, -0.029416916891932487, 1.8518434945716996e-33, 
0.0358523465692997, -0.002374001545831561, 0.009263500571250916, -0.05580880120396614, 0.030508413910865784, -0.037797845900058746, 0.01508091390132904, 0.02779262885451317, -0.04756521061062813, 0.010429342277348042, -0.005697719287127256, 0.03368696570396423, -0.014907917007803917, -0.02615354210138321, -0.05337945744395256, -0.08737822622060776, 0.04612358659505844, 0.016435381025075912, -0.03597096726298332, -0.06492944061756134, 0.11139646172523499, -0.04470240697264671, 0.013333962298929691, 0.06944458186626434, 0.04924115538597107, 0.021988168358802795, -0.0033458129037171602, -0.021327221766114235, 0.04618706554174423, 0.09092214703559875, -0.009819227270781994, 0.03574197739362717, -0.02589249238371849, 0.015359507873654366, 0.01923568733036518, 0.009884021244943142, -0.0687863752245903, 0.008688706904649734, 0.0003024878678843379, 0.006991893518716097, -0.07505182921886444, -0.045765507966279984, 0.005778071004897356, 0.0200499240309, -0.07049272209405899, -0.06168036535382271, 0.044801026582717896, 0.026470575481653214, 0.01803005486726761, 0.04355733096599579, 0.034672655165195465, -0.08011800795793533, 0.03965161740779877, 0.08112046867609024, 0.07237163931131363, 0.07554267346858978, -0.0966770201921463, 0.05703232064843178, 0.007653184700757265, 0.09404793381690979, 0.02874479629099369, 0.032439567148685455, -0.006544401869177818, 0.0747322142124176, -0.06779398024082184, -0.03769124671816826, 0.018574388697743416, -0.0027497054543346167, 0.05186106637120247, 0.045869190245866776, 0.052037931978702545, 0.00877095852047205, 0.00956355594098568, 0.06010708585381508, 0.07063814997673035, -0.05281956121325493, 0.11385682970285416, 0.0014734964352101088, -0.13000114262104034, 0.04160114377737045, 0.002756801201030612, -0.03354136645793915, -0.012316903099417686, -0.04667062684893608, -0.021649040281772614, 0.009122663177549839, 0.07305404543876648, 0.050488732755184174, 0.0037498027086257935, 0.06742933392524719, -0.09808871150016785, 
-0.02533995360136032, 0.07752660661935806, -0.008930775336921215, -0.020734407007694244, -8.718873943854186e-34, 0.030775681138038635, -0.04046367108821869, -0.07485030591487885, 0.06837300956249237, 0.03777360916137695, 0.03171695023775101, 0.038366734981536865, -0.009698187932372093, -0.06721752882003784, 0.03483430668711662, -0.03264770656824112, -0.004821446258574724, 0.017873667180538177, -0.01217806525528431, -0.06693356484174728, -0.042935941368341446, 0.07182027399539948, -0.023592444136738777, 0.010779321193695068, 0.03270953893661499, -0.03838556632399559, -0.010096886195242405, -0.058566078543663025, -0.06304068863391876, -0.013382021337747574, -0.011351224966347218, -0.08517401665449142, 0.007304960861802101, -0.04197632893919945, -0.008837309665977955, 0.000581165833864361, 0.009765408001840115, -0.02323746308684349, -0.07040572166442871, -0.0630621388554573, -0.01030951738357544, 0.07319610565900803, -0.002567168092355132, -0.00982675701379776, 0.08009836822748184, 0.06278694421052933, -0.053986601531505585, -0.13036444783210754, -0.05632428079843521, -0.012127791531383991, -0.00034488266101107, -0.05524465814232826, -0.019998280331492424, -0.041557829827070236, 0.07457990199327469, -0.004864905495196581, 0.0744631364941597, -0.038698967546224594, 0.11076352000236511, 0.08321533352136612, -0.1319902539253235, 0.05189663544297218, -0.08637715131044388, -0.047119464725255966, 0.0712425485253334, 0.038989413529634476, -0.06715074181556702, 0.0770900622010231, -0.016237575560808182, 0.16853967308998108, -0.003975923638790846, 0.11307050287723541, 0.07726389169692993, -0.028748558834195137, 0.04492560029029846, 0.0768602192401886, 0.0852692499756813, 0.021246735006570816, 0.11719376593828201, 0.0029091970063745975, -0.011192459613084793, -0.09389575570821762, 0.021549541503190994, -0.0055024465546011925, 0.032183919101953506, 0.0651387944817543, -0.0652405172586441, 0.03021097555756569, 0.1095665693283081, -0.02563057281076908, 0.05070950835943222, 
0.09074468910694122, 0.08164751529693604, 0.039858028292655945, -0.045717816799879074, -0.01968374475836754, -0.01942502148449421, 0.020252034068107605, 0.028495490550994873, -0.014108758419752121, -2.6071681702433125e-08, -0.004948799964040518, -0.03374723717570305, -0.006966953631490469, 0.04770921543240547, 0.060589514672756195, 0.039017271250486374, -0.06870992481708527, 0.04758283868432045, -0.04153140261769295, -0.009761914610862732, 0.05678777024149895, -0.024886248633265495, 0.08310353755950928, 0.04019981995224953, 0.04347654804587364, -0.016476230695843697, 0.02281028777360916, 0.044384729117155075, 0.012391419149935246, 0.03150279074907303, 0.03414358198642731, 0.023670021444559097, -0.035867370665073395, 0.00584121560677886, 0.03878429904580116, -0.03416749835014343, 0.0317315049469471, 0.014832393266260624, 0.06329585611820221, -0.07007385790348053, -0.11312873661518097, -0.0667077898979187, 0.031542230397462845, 0.03318323940038681, -0.05146196484565735, -0.04369741305708885, 0.030556850135326385, 0.05148332566022873, -0.09324397146701813, 0.08804989606142044, -0.05473781377077103, 0.02356131188571453, -0.0072563826106488705, -0.013308629393577576, 0.022258494049310684, 0.047823697328567505, -0.014027439057826996, -0.018331162631511688, -0.02744504064321518, 0.027262693271040916, -0.03694259002804756, 0.04492212459445, 0.04835069552063942, 0.09086570143699646, -0.0022586847189813852, -0.03940355032682419, -0.005774456076323986, -0.06551025062799454, -0.04700932279229164, -0.00200175354257226, -0.039275478571653366, -0.04998438432812691, -0.08698498457670212, 0.015872927382588387], 'path': 'embedding'}}, {'$set': {'score': {'$meta': 'vectorSearchScore'}}}]\n", - "VectorDB returns doc_ids: [[ObjectId('6682a624b9758026836ffab5'), ObjectId('6682a624b9758026836ffad5')]]\n", - "\u001b[32mNo more context, will terminate.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "TERMINATE\n", - "\n", - 
"--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "# reset the assistant. Always reset the assistant before starting a new conversation.\n", - "assistant.reset()\n", - "\n", - "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", - "# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.\n", - "# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.\n", - "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", - "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\"\n", - "chat_result = ragproxyagent.initiate_chat(\n", - " assistant, message=ragproxyagent.message_generator, problem=code_problem, search_string=\"spark\"\n", - ") # search_string is used as an extra filter for the embeddings search, in this case, we only want to search documents that contain \"spark\"." 
- ] - } - ], - "metadata": { - "front_matter": { - "description": "Explore the use of AutoGen's RetrieveChat for tasks like code generation from docstrings, answering complex questions with human feedback, and exploiting features like Update Context, custom prompts, and few-shot learning.", - "tags": [ - "RAG" - ] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - }, - "skip_test": "Requires interactive usage" - }, - "nbformat": 4, - "nbformat_minor": 4 -} From de12cd1e061e506a2706f9e18ebfc814c2b47df5 Mon Sep 17 00:00:00 2001 From: Jib Date: Tue, 23 Jul 2024 00:52:30 -0400 Subject: [PATCH 25/33] have index management include waiting behaviors --- autogen/agentchat/contrib/vectordb/mongodb.py | 13 +- .../contrib/vectordb/test_mongodb.py | 161 +++++++++++------- 2 files changed, 102 insertions(+), 72 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index c45653fd729a..d24afbba53bd 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -16,7 +16,6 @@ DEFAULT_INSERT_BATCH_SIZE = 100_000 _SAMPLE_SENTENCE = ["The weather is lovely today in paradise."] -_TIMEOUT = 20.0 _DELAY = 0.5 @@ -38,7 +37,7 @@ def __init__( collection_name: str = None, index_name: str = "vector_index", overwrite: bool = False, - wait_until_ready: bool = False, + wait_until_ready: float | None = None, ): """ Initialize the vector database. 
@@ -90,11 +89,11 @@ def _is_index_ready(self, collection: Collection, index_name: str): return True return False - def _wait_for_index(self, collection: Collection, index_name: str, timeout=_TIMEOUT): - """Waits up to 20 seconds for the index to be created to be ready, otherwise - throws a TimeoutError""" + def _wait_for_index(self, collection: Collection, index_name: str): + """Waits for the index to be created to be ready, otherwise + throws a TimeoutError. Timeout set on instantiation""" start = monotonic() - while monotonic() - start < timeout: + while monotonic() - start < self._wait_until_ready: if self._is_index_ready(collection, index_name): return sleep(_DELAY) @@ -185,6 +184,8 @@ def delete_collection(self, collection_name: str) -> None: Args: collection_name: str | The name of the collection. """ + for index in self.db[collection_name].list_search_indexes(): + self.db[collection_name].drop_search_index(index["name"]) return self.db[collection_name].drop() def create_vector_search_index( diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 4b7b222327b3..8dc20437eef2 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -1,8 +1,11 @@ import logging import os +import random from time import monotonic, sleep from typing import List +import pymongo.database +import pymongo.errors import pytest from autogen.agentchat.contrib.vectordb.base import Document @@ -18,6 +21,7 @@ logger.warning(f"skipping {__name__}. 
It requires one to pip install pymongo or the extra [retrievechat-mongodb]") pytest.skip(allow_module_level=True) +from pymongo import MongoClient from pymongo.collection import Collection logger = logging.getLogger(__name__) @@ -29,7 +33,7 @@ RETRIES = 10 DELAY = 2 -TIMEOUT = 20.0 +TIMEOUT = 60.0 def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY): @@ -51,28 +55,46 @@ def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY): sleep(DELAY) -def _delete_collections(database): - """Delete all collections within the database +def _delete_search_indexes(collection: Collection, wait=False): + """Deletes all indexes in a collection + + Args: + collection (pymongo.Collection): MongoDB Collection Abstraction + """ + for index in collection.list_search_indexes(): + try: + collection.drop_search_index(index["name"]) + except pymongo.errors.OperationFailure: + # Delete already issued + pass + if wait: + _wait_for_predicate(lambda: not list(collection.list_search_indexes()), "Not all collections deleted") + + +def _empty_collections_and_delete_indexes(database, collections=None, wait=False): + """Empty all collections within the database and remove indexes Args: database (pymongo.Database): MongoDB Database Abstraction """ - for collection_name in database.list_collection_names(): - database[collection_name].drop() - _wait_for_predicate(lambda: not database.list_collection_names(), "Not all collections deleted") + for collection_name in collections or database.list_collection_names(): + _delete_search_indexes(database[collection_name], wait) + database[collection_name].delete_many({}) @pytest.fixture def db(): """VectorDB setup and teardown, including collections and search indexes""" + database = MongoClient(MONGODB_URI)[MONGODB_DATABASE] + _empty_collections_and_delete_indexes(database) vectorstore = MongoDBAtlasVectorDB( connection_string=MONGODB_URI, database_name=MONGODB_DATABASE, - wait_until_ready=True, + wait_until_ready=TIMEOUT, 
overwrite=True, ) yield vectorstore - _delete_collections(vectorstore.db) + _empty_collections_and_delete_indexes(database) @pytest.fixture @@ -87,20 +109,35 @@ def example_documents() -> List[Document]: @pytest.fixture -def db_with_indexed_clxn(): +def db_with_indexed_clxn(collection_name): """VectorDB with a collection created immediately""" + database = MongoClient(MONGODB_URI)[MONGODB_DATABASE] + _empty_collections_and_delete_indexes(database, [collection_name], wait=True) vectorstore = MongoDBAtlasVectorDB( connection_string=MONGODB_URI, database_name=MONGODB_DATABASE, - wait_until_ready=True, - collection_name=MONGODB_COLLECTION, + wait_until_ready=TIMEOUT, + collection_name=collection_name, overwrite=True, ) - yield vectorstore, vectorstore.db[MONGODB_COLLECTION] - _delete_collections(vectorstore.db) + yield vectorstore, vectorstore.db[collection_name] + _empty_collections_and_delete_indexes(database, [collection_name]) + + +_COLLECTION_NAMING_CACHE = [] + + +@pytest.fixture +def collection_name(): + collection_id = random.randint(0, 100) + while collection_id in _COLLECTION_NAMING_CACHE: + collection_id = random.randint(0, 100) + _COLLECTION_NAMING_CACHE.append(collection_id) + + return f"{MONGODB_COLLECTION}_{collection_id}" -def test_create_collection(db): +def test_create_collection(db, collection_name): """ def create_collection(collection_name: str, overwrite: bool = False) -> Collection @@ -110,8 +147,6 @@ def create_collection(collection_name: str, - Case 3. the collection exists and overwrite is False return the existing collection. - Case 4. 
the collection exists and overwrite is False and get_or_create is False, raise a ValueError """ - collection_name = "test_collection" - collection_case_1 = db.create_collection( collection_name=collection_name, ) @@ -132,9 +167,7 @@ def create_collection(collection_name: str, db.create_collection(collection_name=collection_name, overwrite=False, get_or_create=False) -def test_get_collection(db): - collection_name = MONGODB_COLLECTION - +def test_get_collection(db, collection_name): with pytest.raises(ValueError): db.get_collection() @@ -147,29 +180,29 @@ def test_get_collection(db): assert collection_got.name == db.active_collection.name -def test_delete_collection(db): - assert MONGODB_COLLECTION not in db.list_collections() - collection = db.create_collection(MONGODB_COLLECTION) - assert MONGODB_COLLECTION in db.list_collections() +def test_delete_collection(db, collection_name): + assert collection_name not in db.list_collections() + collection = db.create_collection(collection_name) + assert collection_name in db.list_collections() db.delete_collection(collection.name) - assert MONGODB_COLLECTION not in db.list_collections() + assert collection_name not in db.list_collections() -def test_insert_docs(db, example_documents): +def test_insert_docs(db, collection_name, example_documents): # Test that there's an active collection with pytest.raises(ValueError) as exc: db.insert_docs(example_documents) assert "No collection is specified" in str(exc.value) # Test upsert - db.insert_docs(example_documents, MONGODB_COLLECTION, upsert=True) + db.insert_docs(example_documents, collection_name, upsert=True) # Create a collection - db.delete_collection(MONGODB_COLLECTION) - collection = db.create_collection(MONGODB_COLLECTION) + db.delete_collection(collection_name) + collection = db.create_collection(collection_name) # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, 
collection_name=collection_name) found = list(collection.find({})) assert len(found) == len(example_documents) # Check that documents have correct fields, including "_id" and "embedding" but not "id" @@ -183,7 +216,7 @@ def test_insert_docs(db, example_documents): def test_update_docs(db_with_indexed_clxn, example_documents): db, collection = db_with_indexed_clxn # Use update_docs to insert new documents - db.update_docs(example_documents, MONGODB_COLLECTION, upsert=True) + db.update_docs(example_documents, collection.name, upsert=True) # Test that no changes were made to example_documents assert set(example_documents[0].keys()) == {"id", "content", "metadata"} assert collection.count_documents({}) == len(example_documents) @@ -195,13 +228,13 @@ def test_update_docs(db_with_indexed_clxn, example_documents): # Update an *existing* Document updated_doc = Document(id=1, content="Cats are tough.", metadata={"a": 10}) - db.update_docs([updated_doc], MONGODB_COLLECTION) + db.update_docs([updated_doc], collection.name) assert collection.find_one({"_id": 1})["content"] == "Cats are tough." # Upsert a *new* Document new_id = 3 new_doc = Document(id=new_id, content="Cats are tough.") - db.update_docs([new_doc], MONGODB_COLLECTION, upsert=True) + db.update_docs([new_doc], collection.name, upsert=True) assert collection.find_one({"_id": new_id})["content"] == "Cats are tough." # Attempting to use update to insert a new doc @@ -209,55 +242,55 @@ def test_update_docs(db_with_indexed_clxn, example_documents): # is a no-op in MongoDB. # TODO Confirm behaviour and autogen's preference. 
new_id = 4 new_doc = Document(id=new_id, content="That is NOT a sandwich?") - db.update_docs([new_doc], MONGODB_COLLECTION) + db.update_docs([new_doc], collection.name) assert collection.find_one({"_id": new_id}) is None def test_delete_docs(db_with_indexed_clxn, example_documents): db, clxn = db_with_indexed_clxn # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, collection_name=clxn.name) # Delete the 1s - db.delete_docs(ids=[1, "1"], collection_name=MONGODB_COLLECTION) + db.delete_docs(ids=[1, "1"], collection_name=clxn.name) # Confirm just the 2s remain assert {2, "2"} == {doc["_id"] for doc in clxn.find({})} def test_get_docs_by_ids(db_with_indexed_clxn, example_documents): - db, _ = db_with_indexed_clxn + db, clxn = db_with_indexed_clxn # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, collection_name=clxn.name) # Test without setting "include" kwarg - docs = db.get_docs_by_ids(ids=[2, "2"], collection_name=MONGODB_COLLECTION) + docs = db.get_docs_by_ids(ids=[2, "2"], collection_name=clxn.name) assert len(docs) == 2 assert all([doc["id"] in [2, "2"] for doc in docs]) assert set(docs[0].keys()) == {"id", "content", "metadata"} # Test with include - docs = db.get_docs_by_ids(ids=[2], include=["content"], collection_name=MONGODB_COLLECTION) + docs = db.get_docs_by_ids(ids=[2], include=["content"], collection_name=clxn.name) assert len(docs) == 1 assert set(docs[0].keys()) == {"id", "content"} # Test with empty ids list - docs = db.get_docs_by_ids(ids=[], include=["content"], collection_name=MONGODB_COLLECTION) + docs = db.get_docs_by_ids(ids=[], include=["content"], collection_name=clxn.name) assert len(docs) == 0 # Test with empty ids list - docs = db.get_docs_by_ids(ids=None, include=["content"], collection_name=MONGODB_COLLECTION) + docs = db.get_docs_by_ids(ids=None, include=["content"], 
collection_name=clxn.name) assert len(docs) == 4 def test_retrieve_docs_empty(db_with_indexed_clxn): - db, _ = db_with_indexed_clxn - assert db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=2) == [] + db, clxn = db_with_indexed_clxn + assert db.retrieve_docs(queries=["Cats"], collection_name=clxn.name, n_results=2) == [] def test_retrieve_docs_populated_db_empty_query(db_with_indexed_clxn, example_documents): - db, _ = db_with_indexed_clxn - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db, clxn = db_with_indexed_clxn + db.insert_docs(example_documents, collection_name=clxn.name) # Empty list of queries returns empty list of results - results = db.retrieve_docs(queries=[], collection_name=MONGODB_COLLECTION, n_results=2) + results = db.retrieve_docs(queries=[], collection_name=clxn.name, n_results=2) assert results == [] @@ -267,19 +300,19 @@ def test_retrieve_docs(db_with_indexed_clxn, example_documents): We have the wait_until_ready flag to ensure index is created and ready Immediately adding documents and then querying is only standard for testing """ - db, _ = db_with_indexed_clxn + db, clxn = db_with_indexed_clxn # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, collection_name=clxn.name) n_results = 2 # Number of closest docs to return def results_ready(): - results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) + results = db.retrieve_docs(queries=["Cats"], collection_name=clxn.name, n_results=n_results) return len(results[0]) == n_results _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") - results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) + results = db.retrieve_docs(queries=["Cats"], collection_name=clxn.name, n_results=n_results) assert {doc[0]["id"] for doc in 
results[0]} == {1, 2} assert all(["embedding" not in doc[0] for doc in results[0]]) @@ -290,40 +323,38 @@ def test_retrieve_docs_with_embedding(db_with_indexed_clxn, example_documents): We have the wait_until_ready flag to ensure index is created and ready Immediately adding documents and then querying is only standard for testing """ - db, _ = db_with_indexed_clxn + db, clxn = db_with_indexed_clxn # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, collection_name=clxn.name) n_results = 2 # Number of closest docs to return def results_ready(): - results = db.retrieve_docs(queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results) + results = db.retrieve_docs(queries=["Cats"], collection_name=clxn.name, n_results=n_results) return len(results[0]) == n_results _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") - results = db.retrieve_docs( - queries=["Cats"], collection_name=MONGODB_COLLECTION, n_results=n_results, include_embedding=True - ) + results = db.retrieve_docs(queries=["Cats"], collection_name=clxn.name, n_results=n_results, include_embedding=True) assert {doc[0]["id"] for doc in results[0]} == {1, 2} assert all(["embedding" in doc[0] for doc in results[0]]) def test_retrieve_docs_multiple_queries(db_with_indexed_clxn, example_documents): - db, _ = db_with_indexed_clxn + db, clxn = db_with_indexed_clxn # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, collection_name=clxn.name) n_results = 2 # Number of closest docs to return queries = ["Some good pets", "What kind of Sandwich?"] def results_ready(): - results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=n_results) + results = db.retrieve_docs(queries=queries, collection_name=clxn.name, n_results=n_results) return all([len(res) == 
n_results for res in results]) _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") - results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=2) + results = db.retrieve_docs(queries=queries, collection_name=clxn.name, n_results=2) assert len(results) == len(queries) assert all([len(res) == n_results for res in results]) @@ -332,23 +363,21 @@ def results_ready(): def test_retrieve_docs_with_threshold(db_with_indexed_clxn, example_documents): - db, _ = db_with_indexed_clxn + db, clxn = db_with_indexed_clxn # Insert example documents - db.insert_docs(example_documents, collection_name=MONGODB_COLLECTION) + db.insert_docs(example_documents, collection_name=clxn.name) n_results = 2 # Number of closest docs to return queries = ["Cats"] def results_ready(): - results = db.retrieve_docs(queries=queries, collection_name=MONGODB_COLLECTION, n_results=n_results) + results = db.retrieve_docs(queries=queries, collection_name=clxn.name, n_results=n_results) return len(results[0]) == n_results _wait_for_predicate(results_ready, f"Failed to retrieve docs after waiting {TIMEOUT} seconds after each.") # Distance Threshold of .3 means that the score must be .7 or greater # only one result should be that value - results = db.retrieve_docs( - queries=queries, collection_name=MONGODB_COLLECTION, n_results=n_results, distance_threshold=0.3 - ) + results = db.retrieve_docs(queries=queries, collection_name=clxn.name, n_results=n_results, distance_threshold=0.3) assert len(results[0]) == 1 assert all([doc[1] >= 0.7 for doc in results[0]]) From 5e00b2dee0fd9784abe751b028259d6ba863a435 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 23 Jul 2024 11:44:34 -0400 Subject: [PATCH 26/33] Adds further optional waits or users and tests. Cleans up upsert. 
--- autogen/agentchat/contrib/vectordb/mongodb.py | 42 ++++++++++--------- .../contrib/vectordb/test_mongodb.py | 8 ++-- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index d24afbba53bd..c92feeb9f83e 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -46,10 +46,10 @@ def __init__( connection_string: str | The MongoDB connection string to connect to. Default is ''. database_name: str | The name of the database. Default is 'vector_db'. embedding_function: The embedding function used to generate the vector representation. - overwrite: bool | Overwrite existing collection with new information from this object + overwrite: bool | Overwrite existing collection with new information from this object. defaults to False - wait_until_ready: bool | Blocking call to wait until the database indexes are READY - will timeout after 20 seconds. Defaults to False + wait_until_ready: float | None | Blocking call to wait until the + database indexes are ready. None, the default, means no wait. """ self.embedding_function = embedding_function self.index_name = index_name @@ -89,14 +89,21 @@ def _is_index_ready(self, collection: Collection, index_name: str): return True return False - def _wait_for_index(self, collection: Collection, index_name: str): - """Waits for the index to be created to be ready, otherwise - throws a TimeoutError. Timeout set on instantiation""" + def _wait_for_index(self, collection: Collection, index_name: str, action: str = "create"): + """Waits for the index action to be completed. Otherwise throws a TimeoutError. + + Timeout set on instantiation. + action: "create" or "delete" + """ + assert action in ["create", "delete"], f"{action=} must be create or delete." 
start = monotonic() while monotonic() - start < self._wait_until_ready: - if self._is_index_ready(collection, index_name): + if action == "create" and self._is_index_ready(collection, index_name): + return + elif action == "delete" and len(list(collection.list_search_indexes())) == 0: return sleep(_DELAY) + raise TimeoutError(f"Index {self.index_name} is not ready!") def _get_embedding_size(self): @@ -126,7 +133,7 @@ def create_collection( get_or_create: bool | Whether to get or create the collection. Default is True """ if overwrite: - self.db.drop_collection(collection_name) + self.delete_collection(collection_name) if collection_name not in self.db.list_collection_names(): # Create a new collection @@ -186,6 +193,8 @@ def delete_collection(self, collection_name: str) -> None: """ for index in self.db[collection_name].list_search_indexes(): self.db[collection_name].drop_search_index(index["name"]) + if self._wait_until_ready: + self._wait_for_index(self.db[collection_name], index["name"], "delete") return self.db[collection_name].drop() def create_vector_search_index( @@ -223,7 +232,7 @@ def create_vector_search_index( try: collection.create_search_index(model=search_index_model) if self._wait_until_ready: - self._wait_for_index(collection, index_name) + self._wait_for_index(collection, index_name, "create") logger.debug(f"Search index {index_name} created successfully.") except Exception as e: logger.error( @@ -234,13 +243,6 @@ def create_vector_search_index( ) raise e - def upsert_docs(self, docs, collection): - for doc in docs: - query = {"id": doc["id"]} - doc["embedding"] = np.array(self.embedding_function([doc["content"]])).tolist()[0] - new_values = {"$set": doc} - collection.update_one(query, new_values, upsert=True) - def insert_docs( self, docs: List[Document], @@ -265,7 +267,7 @@ def insert_docs( collection = self.get_collection(collection_name) if upsert: - self.upsert_docs(docs, collection) + self.update_docs(docs, collection.name, upsert=True) else: 
# Sanity checking the first document if docs[0].get("content") is None: @@ -341,7 +343,7 @@ def _insert_batch( ] # insert the documents in MongoDB Atlas insert_result = collection.insert_many(to_insert) # type: ignore - return insert_result.inserted_ids + return insert_result.inserted_ids # TODO Remove this. Replace by log like update_docs def update_docs(self, docs: List[Document], collection_name: str = None, **kwargs: Any) -> None: """Update documents, including their embeddings, in the Collection. @@ -457,11 +459,11 @@ def retrieve_docs( # Check status of index! if self._wait_until_ready: self._wait_for_index(collection, self.index_name) - logger.info(f"Using index: {self.index_name}") + logger.debug(f"Using index: {self.index_name}") results = [] for query_text in queries: # Compute embedding vector from semantic query - logger.info(f"Query: {query_text}") + logger.debug(f"Query: {query_text}") query_vector = np.array(self.embedding_function([query_text])).tolist()[0] # Find documents with similar vectors using the specified index query_result = _vector_search( diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 8dc20437eef2..b7583a7aa4b1 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -33,7 +33,7 @@ RETRIES = 10 DELAY = 2 -TIMEOUT = 60.0 +TIMEOUT = 120.0 def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY): @@ -55,7 +55,7 @@ def _wait_for_predicate(predicate, err, timeout=TIMEOUT, interval=DELAY): sleep(DELAY) -def _delete_search_indexes(collection: Collection, wait=False): +def _delete_search_indexes(collection: Collection, wait=True): """Deletes all indexes in a collection Args: @@ -71,7 +71,7 @@ def _delete_search_indexes(collection: Collection, wait=False): _wait_for_predicate(lambda: not list(collection.list_search_indexes()), "Not all collections deleted") -def 
_empty_collections_and_delete_indexes(database, collections=None, wait=False): +def _empty_collections_and_delete_indexes(database, collections=None, wait=True): """Empty all collections within the database and remove indexes Args: @@ -79,7 +79,7 @@ def _empty_collections_and_delete_indexes(database, collections=None, wait=False """ for collection_name in collections or database.list_collection_names(): _delete_search_indexes(database[collection_name], wait) - database[collection_name].delete_many({}) + database[collection_name].drop() @pytest.fixture From 347fd0ebe08be552db4776a1c1d13bd2dab0ff71 Mon Sep 17 00:00:00 2001 From: Jib Date: Wed, 24 Jul 2024 14:21:26 -0400 Subject: [PATCH 27/33] ensure the embedding size for multiple embedding inputs is equal to dimensions --- autogen/agentchat/contrib/vectordb/mongodb.py | 4 ++-- test/agentchat/contrib/vectordb/test_mongodb.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index c92feeb9f83e..8f2d664bd563 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -361,12 +361,12 @@ def update_docs(self, docs: List[Document], collection_name: str = None, **kwarg n_docs = len(docs) logger.info(f"Preparing to embed and update {n_docs=}") # Compute the embeddings - embeddings = self.embedding_function([doc["content"] for doc in docs]).tolist() + embeddings: list[list[float]] = self.embedding_function([doc["content"] for doc in docs]).tolist() # Prepare the updates all_updates = [] for i in range(n_docs): doc = deepcopy(docs[i]) - doc["embedding"] = embeddings + doc["embedding"] = embeddings[i] doc["_id"] = doc.pop("id") all_updates.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=kwargs.get("upsert", False))) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 
b7583a7aa4b1..8a76c04a6945 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -223,6 +223,7 @@ def test_update_docs(db_with_indexed_clxn, example_documents): found = list(collection.find({})) # Check that documents have correct fields, including "_id" and "embedding" but not "id" assert all([set(doc.keys()) == {"_id", "content", "metadata", "embedding"} for doc in found]) + assert all([len(doc["embedding"]) == db.dimensions for doc in found]) # Check ids assert {doc["_id"] for doc in found} == {1, "1", 2, "2"} From 5790e48825c871449495eaedc6a313ae93785038 Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 25 Jul 2024 04:24:36 -0400 Subject: [PATCH 28/33] fix up tests and add configuration to ensure documents and indexes are READY for querying --- autogen/agentchat/contrib/vectordb/mongodb.py | 41 ++++++++++++++----- notebook/agentchat_RetrieveChat_mongodb.ipynb | 17 +++++--- .../contrib/vectordb/test_mongodb.py | 28 ++++++++++--- 3 files changed, 65 insertions(+), 21 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 8f2d664bd563..751c3d19215c 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -37,7 +37,8 @@ def __init__( collection_name: str = None, index_name: str = "vector_index", overwrite: bool = False, - wait_until_ready: float | None = None, + wait_until_index_ready: float | None = None, + wait_until_document_ready: float | None = None, ): """ Initialize the vector database. @@ -48,13 +49,16 @@ def __init__( embedding_function: The embedding function used to generate the vector representation. overwrite: bool | Overwrite existing collection with new information from this object. 
defaults to False - wait_until_ready: float | None | Blocking call to wait until the + wait_until_index_ready: float | None | Blocking call to wait until the + database indexes are ready. None, the default, means no wait. + wait_until_document_ready: float | None | Blocking call to wait until the database indexes are ready. None, the default, means no wait. """ self.embedding_function = embedding_function self.index_name = index_name self.overwrite = overwrite - self._wait_until_ready = wait_until_ready + self._wait_until_index_ready = wait_until_index_ready + self._wait_until_document_ready = wait_until_document_ready # This will get the model dimension size by computing the embeddings dimensions self.dimensions = self._get_embedding_size() @@ -97,7 +101,7 @@ def _wait_for_index(self, collection: Collection, index_name: str, action: str = """ assert action in ["create", "delete"], f"{action=} must be create or delete." start = monotonic() - while monotonic() - start < self._wait_until_ready: + while monotonic() - start < self._wait_until_index_ready: if action == "create" and self._is_index_ready(collection, index_name): return elif action == "delete" and len(list(collection.list_search_indexes())) == 0: @@ -106,6 +110,21 @@ def _wait_for_index(self, collection: Collection, index_name: str, action: str = raise TimeoutError(f"Index {self.index_name} is not ready!") + def _wait_for_document(self, collection: Collection, index_name: str, doc: Document): + start = monotonic() + while monotonic() - start < self._wait_until_document_ready: + query_result = _vector_search( + embedding_vector=np.array(self.embedding_function(doc["content"])).tolist(), + n_results=1, + collection=collection, + index_name=index_name, + ) + if query_result and query_result[0][0]["_id"] == doc["id"]: + return + sleep(_DELAY) + + raise TimeoutError(f"Document {self.index_name} is not ready!") + def _get_embedding_size(self): return len(self.embedding_function(_SAMPLE_SENTENCE)[0]) @@ -193,7 
+212,7 @@ def delete_collection(self, collection_name: str) -> None: """ for index in self.db[collection_name].list_search_indexes(): self.db[collection_name].drop_search_index(index["name"]) - if self._wait_until_ready: + if self._wait_until_index_ready: self._wait_for_index(self.db[collection_name], index["name"], "delete") return self.db[collection_name].drop() @@ -231,7 +250,7 @@ def create_vector_search_index( # Create the search index try: collection.create_search_index(model=search_index_model) - if self._wait_until_ready: + if self._wait_until_index_ready: self._wait_for_index(collection, index_name, "create") logger.debug(f"Search index {index_name} created successfully.") except Exception as e: @@ -311,6 +330,8 @@ def insert_docs( in_diff=input_ids.difference(result_ids), out_diff=result_ids.difference(input_ids) ) ) + if self._wait_until_document_ready and docs: + self._wait_for_document(collection, self.index_name, docs[-1]) def _insert_batch( self, collection: Collection, texts: List[str], metadatas: List[Mapping[str, Any]], ids: List[ItemID] @@ -374,6 +395,9 @@ def update_docs(self, docs: List[Document], collection_name: str = None, **kwarg collection = self.get_collection(collection_name) result = collection.bulk_write(all_updates) + if self._wait_until_document_ready and docs: + self._wait_for_document(collection, self.index_name, docs[-1]) + # Log a result summary logger.info( "Matched: %s, Modified: %s, Upserted: %s", @@ -441,8 +465,6 @@ def retrieve_docs( n_results: int | The number of relevant documents to return. Default is 10. distance_threshold: float | The threshold for the distance score, only distance smaller than it will be returned. Don't filter with it if < 0. Default is -1. - wait_until_ready: bool | Will not execute the retrieval operation until the specified vector index is - ready to be queried. Defaults is false. kwargs: Dict | Additional keyword arguments. 
Ones of importance follow: oversampling_factor: int | This times n_results is 'ef' in the HNSW algorithm. It determines the number of nearest neighbor candidates to consider during the search phase. @@ -456,9 +478,6 @@ def retrieve_docs( if collection.count_documents({}) == 0: return [] - # Check status of index! - if self._wait_until_ready: - self._wait_for_index(collection, self.index_name) logger.debug(f"Using index: {self.index_name}") results = [] for query_text in queries: diff --git a/notebook/agentchat_RetrieveChat_mongodb.ipynb b/notebook/agentchat_RetrieveChat_mongodb.ipynb index 9c128bffbeab..e2e268a94323 100644 --- a/notebook/agentchat_RetrieveChat_mongodb.ipynb +++ b/notebook/agentchat_RetrieveChat_mongodb.ipynb @@ -48,10 +48,15 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "models to use: ['gpt4-1106-preview', 'gpt-4o', 'gpt-35-turbo', 'gpt-35-turbo-0613']\n" + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: './OAI_CONFIG_LIST'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Accepted file formats for that can be stored in\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# a vector database instance\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mautogen\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mretrieve_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TEXT_FORMATS\n\u001b[0;32m---> 12\u001b[0m config_list \u001b[38;5;241m=\u001b[39m 
\u001b[43mautogen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig_list_from_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43menv_or_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mOAI_CONFIG_LIST\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(config_list) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels to use: \u001b[39m\u001b[38;5;124m\"\u001b[39m, [config_list[i][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(config_list))])\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/autogen/lib/python3.11/site-packages/autogen/oai/openai_utils.py:527\u001b[0m, in \u001b[0;36mconfig_list_from_json\u001b[0;34m(env_or_file, file_location, filter_dict)\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 525\u001b[0m config_list_path \u001b[38;5;241m=\u001b[39m env_or_file\n\u001b[0;32m--> 527\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig_list_path\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m json_file:\n\u001b[1;32m 528\u001b[0m config_list \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(json_file)\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filter_config(config_list, filter_dict)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No 
such file or directory: './OAI_CONFIG_LIST'" ] } ], @@ -163,6 +168,8 @@ " \"connection_string\": \"\", # MongoDB Atlas connection string\n", " \"database_name\": \"test_db\", # MongoDB Atlas database\n", " \"index_name\": \"vector_index\",\n", + " \"wait_until_index_ready\": 120.0, # Setting to wait 120 seconds or until index is constructed before querying\n", + " \"wait_until_document_ready\": 120.0, # Setting to wait 120 seconds or until document is properly indexed after insertion/update\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", @@ -260,7 +267,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.11.9" }, "skip_test": "Requires interactive usage" }, diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 8a76c04a6945..33ae0e1cac33 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -4,7 +4,6 @@ from time import monotonic, sleep from typing import List -import pymongo.database import pymongo.errors import pytest @@ -90,7 +89,7 @@ def db(): vectorstore = MongoDBAtlasVectorDB( connection_string=MONGODB_URI, database_name=MONGODB_DATABASE, - wait_until_ready=TIMEOUT, + wait_until_index_ready=TIMEOUT, overwrite=True, ) yield vectorstore @@ -116,7 +115,7 @@ def db_with_indexed_clxn(collection_name): vectorstore = MongoDBAtlasVectorDB( connection_string=MONGODB_URI, database_name=MONGODB_DATABASE, - wait_until_ready=TIMEOUT, + wait_until_index_ready=TIMEOUT, collection_name=collection_name, overwrite=True, ) @@ -223,6 +222,7 @@ def test_update_docs(db_with_indexed_clxn, example_documents): found = list(collection.find({})) # Check that documents have correct fields, including "_id" and "embedding" but not "id" assert 
all([set(doc.keys()) == {"_id", "content", "metadata", "embedding"} for doc in found]) + assert all([isinstance(doc["embedding"][0], float) for doc in found]) assert all([len(doc["embedding"]) == db.dimensions for doc in found]) # Check ids assert {doc["_id"] for doc in found} == {1, "1", 2, "2"} @@ -298,7 +298,7 @@ def test_retrieve_docs_populated_db_empty_query(db_with_indexed_clxn, example_do def test_retrieve_docs(db_with_indexed_clxn, example_documents): """Begin testing Atlas Vector Search NOTE: Indexing may take some time, so we must be patient on the first query. - We have the wait_until_ready flag to ensure index is created and ready + We have the wait_until_index_ready flag to ensure index is created and ready Immediately adding documents and then querying is only standard for testing """ db, clxn = db_with_indexed_clxn @@ -321,7 +321,7 @@ def results_ready(): def test_retrieve_docs_with_embedding(db_with_indexed_clxn, example_documents): """Begin testing Atlas Vector Search NOTE: Indexing may take some time, so we must be patient on the first query. 
- We have the wait_until_ready flag to ensure index is created and ready + We have the wait_until_index_ready flag to ensure index is created and ready Immediately adding documents and then querying is only standard for testing """ db, clxn = db_with_indexed_clxn @@ -382,3 +382,21 @@ def results_ready(): results = db.retrieve_docs(queries=queries, collection_name=clxn.name, n_results=n_results, distance_threshold=0.3) assert len(results[0]) == 1 assert all([doc[1] >= 0.7 for doc in results[0]]) + + +def test_wait_until_document_ready(collection_name, example_documents): + database = MongoClient(MONGODB_URI)[MONGODB_DATABASE] + _empty_collections_and_delete_indexes(database, [collection_name], wait=True) + try: + vectorstore = MongoDBAtlasVectorDB( + connection_string=MONGODB_URI, + database_name=MONGODB_DATABASE, + wait_until_index_ready=TIMEOUT, + collection_name=collection_name, + overwrite=True, + wait_until_document_ready=TIMEOUT, + ) + vectorstore.insert_docs(example_documents) + assert vectorstore.retrieve_docs(queries=["Cats"], n_results=4) + finally: + _empty_collections_and_delete_indexes(database, [collection_name]) From 88040872a9cf0c0a4253c86799c66ecf448f0e25 Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 25 Jul 2024 10:20:09 -0400 Subject: [PATCH 29/33] fix import failure --- test/agentchat/contrib/vectordb/test_mongodb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/agentchat/contrib/vectordb/test_mongodb.py b/test/agentchat/contrib/vectordb/test_mongodb.py index 33ae0e1cac33..3ae1ed572591 100644 --- a/test/agentchat/contrib/vectordb/test_mongodb.py +++ b/test/agentchat/contrib/vectordb/test_mongodb.py @@ -4,7 +4,6 @@ from time import monotonic, sleep from typing import List -import pymongo.errors import pytest from autogen.agentchat.contrib.vectordb.base import Document @@ -22,6 +21,7 @@ from pymongo import MongoClient from pymongo.collection import Collection +from pymongo.errors import OperationFailure logger = 
logging.getLogger(__name__) @@ -63,7 +63,7 @@ def _delete_search_indexes(collection: Collection, wait=True): for index in collection.list_search_indexes(): try: collection.drop_search_index(index["name"]) - except pymongo.errors.OperationFailure: + except OperationFailure: # Delete already issued pass if wait: From ead65cae7ad2adda60fe9e780c1a5b44877cbf0d Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 25 Jul 2024 10:41:04 -0400 Subject: [PATCH 30/33] adjust typing for 3.9 --- autogen/agentchat/contrib/vectordb/mongodb.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/mongodb.py b/autogen/agentchat/contrib/vectordb/mongodb.py index 751c3d19215c..2e0580fe826b 100644 --- a/autogen/agentchat/contrib/vectordb/mongodb.py +++ b/autogen/agentchat/contrib/vectordb/mongodb.py @@ -37,8 +37,8 @@ def __init__( collection_name: str = None, index_name: str = "vector_index", overwrite: bool = False, - wait_until_index_ready: float | None = None, - wait_until_document_ready: float | None = None, + wait_until_index_ready: float = None, + wait_until_document_ready: float = None, ): """ Initialize the vector database. @@ -46,9 +46,11 @@ def __init__( Args: connection_string: str | The MongoDB connection string to connect to. Default is ''. database_name: str | The name of the database. Default is 'vector_db'. - embedding_function: The embedding function used to generate the vector representation. - overwrite: bool | Overwrite existing collection with new information from this object. - defaults to False + embedding_function: Callable | The embedding function used to generate the vector representation. + collection_name: str | The name of the collection to create for this vector database + Defaults to None + index_name: str | Index name for the vector database, defaults to 'vector_index' + overwrite: bool = False wait_until_index_ready: float | None | Blocking call to wait until the database indexes are ready. 
None, the default, means no wait. wait_until_document_ready: float | None | Blocking call to wait until the @@ -56,7 +58,6 @@ def __init__( """ self.embedding_function = embedding_function self.index_name = index_name - self.overwrite = overwrite self._wait_until_index_ready = wait_until_index_ready self._wait_until_document_ready = wait_until_document_ready @@ -73,7 +74,7 @@ def __init__( self.db = self.client[database_name] logger.debug(f"Atlas Database name: {self.db.name}") if collection_name: - self.active_collection = self.create_collection(collection_name, overwrite=self.overwrite) + self.active_collection = self.create_collection(collection_name, overwrite) else: self.active_collection = None From 892b81aa9347b6447ef3d8aac165c1033ce98743 Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 25 Jul 2024 13:43:06 -0400 Subject: [PATCH 31/33] fix up the notebook output --- notebook/agentchat_RetrieveChat_mongodb.ipynb | 370 ++++++++++++++++-- 1 file changed, 340 insertions(+), 30 deletions(-) diff --git a/notebook/agentchat_RetrieveChat_mongodb.ipynb b/notebook/agentchat_RetrieveChat_mongodb.ipynb index e2e268a94323..65a02e185c2a 100644 --- a/notebook/agentchat_RetrieveChat_mongodb.ipynb +++ b/notebook/agentchat_RetrieveChat_mongodb.ipynb @@ -44,19 +44,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './OAI_CONFIG_LIST'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# Accepted file formats for that can be stored in\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# a vector database instance\u001b[39;00m\n\u001b[1;32m 10\u001b[0m 
\u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mautogen\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mretrieve_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TEXT_FORMATS\n\u001b[0;32m---> 12\u001b[0m config_list \u001b[38;5;241m=\u001b[39m \u001b[43mautogen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig_list_from_json\u001b[49m\u001b[43m(\u001b[49m\u001b[43menv_or_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mOAI_CONFIG_LIST\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(config_list) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels to use: \u001b[39m\u001b[38;5;124m\"\u001b[39m, [config_list[i][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(config_list))])\n", - "File \u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/autogen/lib/python3.11/site-packages/autogen/oai/openai_utils.py:527\u001b[0m, in \u001b[0;36mconfig_list_from_json\u001b[0;34m(env_or_file, file_location, filter_dict)\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 525\u001b[0m config_list_path \u001b[38;5;241m=\u001b[39m env_or_file\n\u001b[0;32m--> 527\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig_list_path\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m 
json_file:\n\u001b[1;32m 528\u001b[0m config_list \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(json_file)\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filter_config(config_list, filter_dict)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './OAI_CONFIG_LIST'" + "name": "stdout", + "output_type": "stream", + "text": [ + "models to use: ['gpt-3.5-turbo-0125']\n" ] } ], @@ -72,7 +67,7 @@ "# a vector database instance\n", "from autogen.retrieve_utils import TEXT_FORMATS\n", "\n", - "config_list = autogen.config_list_from_json(env_or_file=\"OAI_CONFIG_LIST\", file_location=\".\")\n", + "config_list = [{\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"\", \"api_type\": \"openai\"}]\n", "assert len(config_list) > 0\n", "print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])" ] @@ -95,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -114,18 +109,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/lijiang1/anaconda3/envs/pr2942/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from tqdm.autonotebook import tqdm, trange\n" - ] - } - ], + "outputs": [], "source": [ "# 1. 
create an RetrieveAssistantAgent instance named \"assistant\"\n", "assistant = RetrieveAssistantAgent(\n", @@ -194,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -208,22 +194,346 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-07-21 21:50:16,476 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "2024-07-21 21:50:16,532 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Using index: vector_index\u001b[0m\n", - "2024-07-21 21:50:16,533 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\u001b[0m\n" + "2024-07-25 13:41:51,409 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-07-25 13:41:51,413 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Preparing to embed and update n_docs=2\u001b[0m\n", + "2024-07-25 13:41:52,096 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Matched: 0, Modified: 0, Upserted: 2\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "VectorDB returns doc_ids: [[]]\n", - "\u001b[32mNo more context, will terminate.\u001b[0m\n", + "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", + "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", + "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", - "TERMINATE\n", + "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", + "context provided by the user.\n", + "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", + "For code generation, you must obey the following rules:\n", + "Rule 1. 
You MUST NOT install any packages because all the packages needed are already installed.\n", + "Rule 2. You must follow the formats below to write your code:\n", + "```language\n", + "# your code\n", + "```\n", + "\n", + "User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", + "\n", + "Context is: # Integrate - Spark\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", + "\n", + "## Spark ML Estimators\n", + "\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", + "\n", + "### Data\n", + "\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", + "\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "\n", + "Here is an example code snippet for Spark Data:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "\n", + "# Creating a dictionary\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000],\n", + "}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", + "\n", + "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "\n", + "Here is an example of how to use it:\n", + "\n", + "```python\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "```\n", + "\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", + "\n", + "### Estimators\n", + "\n", + "#### Model List\n", + "\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", + "\n", + "#### Usage\n", + "\n", + "First, prepare your data in the 
required format as described in the previous section.\n", + "\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", + "\n", + "Here is an example code snippet using SparkML models in AutoML:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", + "\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", + "\n", + "## Parallel Spark Jobs\n", + "\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", + "\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", + "\n", + "All the Spark-related arguments are stated below. 
These arguments are available in both Hyperparameter Tuning and AutoML:\n", + "\n", + "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", + "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", + "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. 
Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", + "\n", + "An example code snippet for using parallel Spark jobs:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", + "```\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "# Research\n", + "\n", + "For technical details, please check our research publications.\n", + "\n", + "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", + "```\n", + "\n", + "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. 
AAAI 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", + "```\n", + "\n", + "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", + "```\n", + "\n", + "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", + "```\n", + "\n", + "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", + "```\n", + "\n", + "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", + "\n", + "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", + "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", + "\n", + "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. 
Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", + "```\n", + "\n", + "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "To use FLAML to perform a classification task and use Spark for parallel training with a timeout of 30 seconds and force canceling jobs if the time limit is reached, you can follow the below code snippet:\n", + "\n", + "```python\n", + "import flaml\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Prepare your data in pandas-on-spark format\n", + "data = {\n", + " \"feature1\": [val1, val2, val3, val4],\n", + " \"feature2\": [val5, val6, val7, val8],\n", + " \"target\": [class1, class2, class1, class2],\n", + "}\n", + "\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"target\"\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "\n", + "# Prepare your features using VectorAssembler\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf)\n", + "\n", + "# Define AutoML 
settings and fit the model\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"accuracy\",\n", + " \"task\": \"classification\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # Optional\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "In the code:\n", + "- Replace `val1, val2, ..., class1, class2` with your actual data values.\n", + "- Ensure the features and target columns are correctly specified in the data dictionary.\n", + "- Set the `time_budget` parameter to 30 to limit the training time.\n", + "- The `force_cancel` parameter is set to `True` to force cancel Spark jobs if the time limit is exceeded.\n", + "\n", + "Make sure to adapt the code to your specific dataset and requirements.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "UPDATE CONTEXT\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", + "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", + "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", + "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", + "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", + "\u001b[32mNo more context, will terminate.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "TERMINATE\n", From 2cca0c04cabb0266262e2b657e0a707179442bbd Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 25 Jul 2024 13:48:12 -0400 Subject: [PATCH 32/33] changed language to communicate time taken on first init_chat call --- notebook/agentchat_RetrieveChat_mongodb.ipynb | 21 
++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/notebook/agentchat_RetrieveChat_mongodb.ipynb b/notebook/agentchat_RetrieveChat_mongodb.ipynb index 65a02e185c2a..14baf3a5e78c 100644 --- a/notebook/agentchat_RetrieveChat_mongodb.ipynb +++ b/notebook/agentchat_RetrieveChat_mongodb.ipynb @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -134,6 +134,7 @@ "# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n", "# In this example, we set it to [\"non-existent-type\"] to only process markdown files. Since no \"non-existent-type\" files are included in the `websit/docs`,\n", "# no files there will be processed. However, the explicitly included urls will still be processed.\n", + "# **NOTE** Upon the first time adding in the documents, initial query may be slower due to index creation and document indexing time\n", "ragproxyagent = RetrieveUserProxyAgent(\n", " name=\"ragproxyagent\",\n", " human_input_mode=\"NEVER\",\n", @@ -151,14 +152,14 @@ " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", " \"collection_name\": \"demo_collection\",\n", " \"db_config\": {\n", - " \"connection_string\": \"\", # MongoDB Atlas connection string\n", + " \"connection_string\": \"\", # MongoDB Atlas connection string\n", " \"database_name\": \"test_db\", # MongoDB Atlas database\n", " \"index_name\": \"vector_index\",\n", " \"wait_until_index_ready\": 120.0, # Setting to wait 120 seconds or until index is constructed before querying\n", " \"wait_until_document_ready\": 120.0, # Setting to wait 120 seconds or until document is properly indexed after insertion/update\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", - " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", + " 
\"overwrite\": False, # set to True if you want to overwrite an existing collection, each overwrite will force a index creation and reupload of documents\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", ")" @@ -180,9 +181,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-25 13:47:30,700 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `demo_collection`.\u001b[0m\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -194,9 +202,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-07-25 13:41:51,409 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "2024-07-25 13:41:51,413 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Preparing to embed and update n_docs=2\u001b[0m\n", - "2024-07-25 13:41:52,096 - autogen.agentchat.contrib.vectordb.mongodb - INFO - Matched: 0, Modified: 0, Upserted: 2\u001b[0m\n" + "2024-07-25 13:47:31,048 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-07-25 13:47:31,051 - autogen.agentchat.contrib.vectordb.mongodb - INFO - No documents to insert.\u001b[0m\n" ] }, { From 7a44641a7e554920c1d9c7e700986c5407d137e5 Mon Sep 17 00:00:00 2001 From: Jib Date: Thu, 25 Jul 2024 16:39:22 -0400 Subject: [PATCH 33/33] replace environment variable usage --- notebook/agentchat_RetrieveChat_mongodb.ipynb | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/notebook/agentchat_RetrieveChat_mongodb.ipynb b/notebook/agentchat_RetrieveChat_mongodb.ipynb index 14baf3a5e78c..18494e28401d 100644 --- a/notebook/agentchat_RetrieveChat_mongodb.ipynb +++ b/notebook/agentchat_RetrieveChat_mongodb.ipynb @@ -37,9 +37,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Set your API Endpoint\n", 
- "\n", - "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n" + "## Set your API Endpoint\n" ] }, { @@ -67,7 +65,7 @@ "# a vector database instance\n", "from autogen.retrieve_utils import TEXT_FORMATS\n", "\n", - "config_list = [{\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"\", \"api_type\": \"openai\"}]\n", + "config_list = [{\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": os.environ[\"OPENAI_API_KEY\"], \"api_type\": \"openai\"}]\n", "assert len(config_list) > 0\n", "print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])" ] @@ -152,7 +150,7 @@ " \"vector_db\": \"mongodb\", # MongoDB Atlas database\n", " \"collection_name\": \"demo_collection\",\n", " \"db_config\": {\n", - " \"connection_string\": \"\", # MongoDB Atlas connection string\n", + " \"connection_string\": os.environ[\"MONGODB_URI\"], # MongoDB Atlas connection string\n", " \"database_name\": \"test_db\", # MongoDB Atlas database\n", " \"index_name\": \"vector_index\",\n", " \"wait_until_index_ready\": 120.0, # Setting to wait 120 seconds or until index is constructed before querying\n",