microsoft · sonichi · Nov 21, 2023 · Oct 31, 2023 · Oct 31, 2023 · Nov 1, 2023
diff --git a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -6,7 +6,7 @@
     raise ImportError("Please install dependencies first. `pip install pyautogen[retrievechat]`")
 from autogen.agentchat.agent import Agent
 from autogen.agentchat import UserProxyAgent
-from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db
+from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db, TEXT_FORMATS
 from autogen.token_count_utils import count_token
 from autogen.code_utils import extract_code
 
@@ -129,6 +129,8 @@ def __init__(
                     Default is autogen.token_count_utils.count_token that uses tiktoken, which may not be accurate for non-OpenAI models.
                 - custom_text_split_function(Optional, Callable): a custom function to split a string into a list of strings.
                     Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
+                - custom_text_types(Optional, List[str]): a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.
+                - recursive(Optional, bool): whether to search documents recursively in the docs_path. Default is True.
             **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
 
         Example of overriding retrieve_docs:
@@ -183,6 +185,8 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         )
         self.custom_token_count_function = self._retrieve_config.get("custom_token_count_function", count_token)
         self.custom_text_split_function = self._retrieve_config.get("custom_text_split_function", None)
+        self._custom_text_types = self._retrieve_config.get("custom_text_types", TEXT_FORMATS)
+        self._recursive = self._retrieve_config.get("recursive", True)
         self._context_max_tokens = self._max_tokens * 0.8
         self._collection = True if self._docs_path is None else False  # whether the collection is created
         self._ipython = get_ipython()
@@ -373,6 +377,8 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                 get_or_create=self._get_or_create,
                 embedding_function=self._embedding_function,
                 custom_text_split_function=self.custom_text_split_function,
+                custom_text_types=self._custom_text_types,
+                recursive=self._recursive,
             )
             self._collection = True
             self._get_or_create = False

diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
@@ -216,6 +216,8 @@ def create_vector_db_from_dir(
     embedding_model: str = "all-MiniLM-L6-v2",
     embedding_function: Callable = None,
     custom_text_split_function: Callable = None,
+    custom_text_types: List[str] = TEXT_FORMATS,
+    recursive: bool = True,
 ) -> API:
     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
         a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
@@ -236,6 +238,10 @@ def create_vector_db_from_dir(
         embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with
             the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding
             functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`.
+        custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
+            Default is None, in which case the default function `autogen.retrieve_utils.split_text_to_chunks` is used.
+        custom_text_types (Optional, List[str]): a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.
+        recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
 
     Returns:
         API: the chromadb client.
@@ -260,11 +266,15 @@ def create_vector_db_from_dir(
 
         if custom_text_split_function is not None:
             chunks = split_files_to_chunks(
-                get_files_from_dir(dir_path), custom_text_split_function=custom_text_split_function
+                get_files_from_dir(dir_path, custom_text_types, recursive),
+                custom_text_split_function=custom_text_split_function,
             )
         else:
             chunks = split_files_to_chunks(
-                get_files_from_dir(dir_path), max_tokens, chunk_mode, must_break_at_empty_line
+                get_files_from_dir(dir_path, custom_text_types, recursive),
+                max_tokens,
+                chunk_mode,
+                must_break_at_empty_line,
             )
         logger.info(f"Found {len(chunks)} chunks.")
         # Upsert in batch of 40000 or less if the total number of chunks is less than 40000

diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
@@ -154,6 +154,7 @@ def custom_text_split_function(text):
             collection_name="mytestcollection",
             custom_text_split_function=custom_text_split_function,
             get_or_create=True,
+            recursive=False,
         )
         results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
         assert (
@@ -163,7 +164,12 @@ def custom_text_split_function(text):
 
     def test_retrieve_utils(self):
         client = chromadb.PersistentClient(path="/tmp/chromadb")
-        create_vector_db_from_dir(dir_path="./website/docs", client=client, collection_name="autogen-docs")
+        create_vector_db_from_dir(
+            dir_path="./website/docs",
+            client=client,
+            collection_name="autogen-docs",
+            custom_text_types=["txt", "md", "rtf", "rst"],
+        )
         results = query_vector_db(
             query_texts=[
                 "How can I use AutoGen UserProxyAgent and AssistantAgent to do code generation?",