
Commit fa49969

ShobhitVishnoi30, sonichi, svrapidinnovation, thinkall, and qingyun-wu authored
Add additional docs in retrieval agent if required (microsoft#1028)
* Update conversable_agent.py
* Add files via upload
* Delete notebook/Async_human_input.ipynb
* Add files via upload
* refactor:formatter
* feat:updated position
* Update dbutils.py
* added feature to add docs in retrieve
* Update dbutils.py
* Update retrieve_user_proxy_agent.py
* Update retrieve_utils.py
* Update qdrant_retrieve_user_proxy_agent.py
* Update qdrant_retrieve_user_proxy_agent.py
* feat:fixed pre commit issue

---------

Co-authored-by: Chi Wang <[email protected]>
Co-authored-by: svrapidinnovation <[email protected]>
Co-authored-by: Li Jiang <[email protected]>
Co-authored-by: Qingyun Wu <[email protected]>
1 parent 8169831 commit fa49969

File tree

3 files changed: +31, -5 lines


autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py

+19-3
@@ -47,6 +47,9 @@ def __init__(
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
                 the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): when True, adds new documents with unique IDs without overwriting existing ones; when False, default IDs are reused and existing documents may be replaced, risking a collection overwrite.
+                When True, new document chunks are assigned IDs starting from "length+i" (where "length" is the current number of documents in the collection), so existing documents are preserved and more content can be added to the collection.
+                By default, "extra_docs" is False and document IDs start from zero, so new documents may overwrite existing ones and cause unintended loss or alteration of data in the collection.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -116,6 +119,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                 custom_text_split_function=self.custom_text_split_function,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
+                extra_docs=self._extra_docs,
                 parallel=self._parallel,
                 on_disk=self._on_disk,
                 quantization_config=self._quantization_config,
@@ -146,6 +150,7 @@ def create_qdrant_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
     parallel: int = 0,
     on_disk: bool = False,
     quantization_config: Optional[models.QuantizationConfig] = None,
@@ -169,6 +174,7 @@ def create_qdrant_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
+        extra_docs (Optional, bool): whether to add more documents to the collection. Default is False.
         parallel (Optional, int): How many parallel workers to use for embedding. Defaults to the number of CPU cores
         on_disk (Optional, bool): Whether to store the collection on disk. Default is False.
         quantization_config: Quantization configuration. If None, quantization will be disabled.
@@ -194,22 +200,32 @@ def create_qdrant_from_dir(
     )
     logger.info(f"Found {len(chunks)} chunks.")

+    collection = None
     # Check if collection by same name exists, if not, create it with custom options
     try:
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
     except Exception:
         client.create_collection(
             collection_name=collection_name,
             vectors_config=client.get_fastembed_vector_params(
                 on_disk=on_disk, quantization_config=quantization_config, hnsw_config=hnsw_config
             ),
         )
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
+
+    length = 0
+    if extra_docs:
+        length = len(collection.get()["ids"])

     # Upsert in batch of 100 or less if the total number of chunks is less than 100
     for i in range(0, len(chunks), min(100, len(chunks))):
         end_idx = i + min(100, len(chunks) - i)
-        client.add(collection_name, documents=chunks[i:end_idx], ids=[j for j in range(i, end_idx)], parallel=parallel)
+        client.add(
+            collection_name,
+            documents=chunks[i:end_idx],
+            ids=[(j + length) for j in range(i, end_idx)],
+            parallel=parallel,
+        )

     # Create a payload index for the document field
     # Enables highly efficient payload filtering. Reference: https://qdrant.tech/documentation/concepts/indexing/#indexing
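The appended-ID logic above can be illustrated with a small stand-alone sketch. This is not the committed code: it assumes qdrant-client with the fastembed extra installed, derives the offset from `QdrantClient.count()` rather than from the retrieved collection object, and the helper name `append_chunks` plus the collection name are made up for the example.

```python
from typing import List

from qdrant_client import QdrantClient


def append_chunks(client: QdrantClient, collection_name: str, chunks: List[str], extra_docs: bool = True) -> None:
    """Illustrative helper: add text chunks without clobbering existing integer point IDs."""
    # Offset new IDs by the number of points already stored; 0 reproduces the old
    # overwrite-from-zero behaviour that extra_docs=False keeps as the default.
    length = client.count(collection_name=collection_name).count if extra_docs else 0

    # Upsert in batches of at most 100 chunks, mirroring the loop in create_qdrant_from_dir.
    for i in range(0, len(chunks), min(100, len(chunks))):
        end_idx = i + min(100, len(chunks) - i)
        client.add(
            collection_name,
            documents=chunks[i:end_idx],
            ids=[j + length for j in range(i, end_idx)],  # unique IDs appended after existing docs
        )


# Example run: seed a collection, then append two more chunks as IDs 1 and 2.
client = QdrantClient(":memory:")
client.add("autogen-docs", documents=["seed chunk"], ids=[0])
append_chunks(client, "autogen-docs", ["extra chunk one", "extra chunk two"])
```

Note that offsetting only avoids ID collisions; it does not deduplicate chunks that were already indexed in an earlier run.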

autogen/agentchat/contrib/retrieve_user_proxy_agent.py

+5
@@ -100,6 +100,9 @@ def __init__(
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
                 the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): when True, adds new documents with unique IDs without overwriting existing ones; when False, default IDs are reused and existing documents may be replaced, risking a collection overwrite.
+                When True, new document chunks are assigned IDs starting from "length+i" (where "length" is the current number of documents in the collection), so existing documents are preserved and more content can be added to the collection.
+                By default, "extra_docs" is False and document IDs start from zero, so new documents may overwrite existing ones and cause unintended loss or alteration of data in the collection.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -171,6 +174,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self._task = self._retrieve_config.get("task", "default")
         self._client = self._retrieve_config.get("client", chromadb.Client())
         self._docs_path = self._retrieve_config.get("docs_path", None)
+        self._extra_docs = self._retrieve_config.get("extra_docs", False)
         self._collection_name = self._retrieve_config.get("collection_name", "autogen-docs")
         if "docs_path" not in self._retrieve_config:
             logger.warning(
@@ -392,6 +396,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                 custom_text_split_function=self.custom_text_split_function,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
+                extra_docs=self._extra_docs,
             )
             self._collection = True
             self._get_or_create = True
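From the caller's side, the new flag is just another `retrieve_config` key read in `__init__` above. A minimal usage sketch, assuming a collection that already holds documents; the docs path, collection name, and client choice below are placeholders, not taken from the commit.

```python
import chromadb

from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Reuse one chromadb client so the "autogen-docs" collection outlives the first
# indexing run (a persistent client would be typical in practice).
chroma_client = chromadb.Client()

ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "qa",
        "docs_path": "./new_docs",          # extra files to fold into the existing collection
        "client": chroma_client,
        "collection_name": "autogen-docs",  # same collection as the earlier indexing run
        "extra_docs": True,                 # append with offset IDs instead of overwriting doc_0, doc_1, ...
    },
)
```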

autogen/retrieve_utils.py

+7-2
@@ -250,6 +250,7 @@ def create_vector_db_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
 ) -> API:
     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
     a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
@@ -274,7 +275,7 @@ def create_vector_db_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
-
+        extra_docs (Optional, bool): whether to add more documents to the collection. Default is False.
     Returns:
         API: the chromadb client.
     """
@@ -296,6 +297,10 @@ def create_vector_db_from_dir(
             metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
         )

+        length = 0
+        if extra_docs:
+            length = len(collection.get()["ids"])
+
         if custom_text_split_function is not None:
             chunks = split_files_to_chunks(
                 get_files_from_dir(dir_path, custom_text_types, recursive),
@@ -314,7 +319,7 @@ def create_vector_db_from_dir(
            end_idx = i + min(40000, len(chunks) - i)
            collection.upsert(
                documents=chunks[i:end_idx],
-               ids=[f"doc_{j}" for j in range(i, end_idx)],  # unique for each doc
+               ids=[f"doc_{j+length}" for j in range(i, end_idx)],  # unique for each doc
            )
     except ValueError as e:
         logger.warning(f"{e}")
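To make the offset concrete, here is a small chromadb-only sketch of the `doc_{j+length}` scheme, assuming a recent chromadb release with `Collection.upsert`; the collection name and chunk contents are made up, and only the ID arithmetic mirrors the change above.

```python
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("autogen-docs")

# Pretend an earlier indexing run already stored two chunks as doc_0 and doc_1.
collection.upsert(documents=["old chunk A", "old chunk B"], ids=["doc_0", "doc_1"])

new_chunks = ["new chunk C", "new chunk D"]

# extra_docs=True path: offset new IDs by the number of stored IDs so doc_0/doc_1 survive.
length = len(collection.get()["ids"])  # 2
collection.upsert(
    documents=new_chunks,
    ids=[f"doc_{j + length}" for j in range(len(new_chunks))],  # doc_2, doc_3
)
print(sorted(collection.get()["ids"]))  # ['doc_0', 'doc_1', 'doc_2', 'doc_3']

# With extra_docs=False (the old default) length stays 0, the IDs restart at doc_0,
# and the upsert silently replaces the original documents instead of appending.
```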
