diff --git a/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py
index 88a2d318aa95..0e2e0d6c9946 100644
--- a/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py
+++ b/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py
@@ -47,6 +47,9 @@ def __init__(
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to
                 a single file, the url to a single file or a list of directories, files and urls.
                 Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): whether to append documents to an existing collection. When True, new chunks
+                are assigned unique IDs starting at "length + i", where "length" is the current collection size, so
+                existing documents are never overwritten. Default is False, which starts IDs at zero and risks overwriting documents already in the collection.
             - collection_name (Optional, str): the name of the collection. If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -116,6 +119,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                custom_text_split_function=self.custom_text_split_function,
                custom_text_types=self._custom_text_types,
                recursive=self._recursive,
+               extra_docs=self._extra_docs,
                parallel=self._parallel,
                on_disk=self._on_disk,
                quantization_config=self._quantization_config,
@@ -146,6 +150,7 @@ def create_qdrant_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
     parallel: int = 0,
     on_disk: bool = False,
     quantization_config: Optional[models.QuantizationConfig] = None,
@@ -169,6 +174,7 @@ def create_qdrant_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
+        extra_docs (Optional, bool): whether to append more documents to the collection. Default is False.
         parallel (Optional, int): How many parallel workers to use for embedding. Defaults to the number of CPU cores
         on_disk (Optional, bool): Whether to store the collection on disk. Default is False.
         quantization_config: Quantization configuration. If None, quantization will be disabled.
@@ -194,9 +200,10 @@ def create_qdrant_from_dir(
     )
     logger.info(f"Found {len(chunks)} chunks.")

+    collection = None
     # Check if collection by same name exists, if not, create it with custom options
     try:
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
     except Exception:
         client.create_collection(
             collection_name=collection_name,
@@ -204,12 +211,21 @@ def create_qdrant_from_dir(
                 on_disk=on_disk, quantization_config=quantization_config, hnsw_config=hnsw_config
             ),
         )
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
+
+    length = 0
+    if extra_docs:
+        length = collection.points_count  # CollectionInfo from qdrant_client exposes points_count, not .get()

     # Upsert in batch of 100 or less if the total number of chunks is less than 100
     for i in range(0, len(chunks), min(100, len(chunks))):
         end_idx = i + min(100, len(chunks) - i)
-        client.add(collection_name, documents=chunks[i:end_idx], ids=[j for j in range(i, end_idx)], parallel=parallel)
+        client.add(
+            collection_name,
+            documents=chunks[i:end_idx],
+            ids=[(j + length) for j in range(i, end_idx)],
+            parallel=parallel,
+        )

     # Create a payload index for the document field
     # Enables highly efficient payload filtering. Reference: https://qdrant.tech/documentation/concepts/indexing/#indexing
diff --git a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
index 28879d315632..e07c2e578171 100644
--- a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
+++ b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -100,6 +100,9 @@ def __init__(
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to
                 a single file, the url to a single file or a list of directories, files and urls.
                 Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): whether to append documents to an existing collection. When True, new chunks
+                are assigned unique IDs starting at "length + i", where "length" is the current collection size, so
+                existing documents are never overwritten. Default is False, which starts IDs at zero and risks overwriting documents already in the collection.
             - collection_name (Optional, str): the name of the collection. If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -171,6 +174,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self._task = self._retrieve_config.get("task", "default")
         self._client = self._retrieve_config.get("client", chromadb.Client())
         self._docs_path = self._retrieve_config.get("docs_path", None)
+        self._extra_docs = self._retrieve_config.get("extra_docs", False)
         self._collection_name = self._retrieve_config.get("collection_name", "autogen-docs")
         if "docs_path" not in self._retrieve_config:
             logger.warning(
@@ -392,6 +396,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                custom_text_split_function=self.custom_text_split_function,
                custom_text_types=self._custom_text_types,
                recursive=self._recursive,
+               extra_docs=self._extra_docs,
            )
            self._collection = True
            self._get_or_create = True
diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index d65fabb440bb..31467b07cb41 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -250,6 +250,7 @@ def create_vector_db_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
 ) -> API:
     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
         a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
@@ -274,7 +275,7 @@ def create_vector_db_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
-
+        extra_docs (Optional, bool): whether to append more documents to the collection. Default is False.
     Returns:
         API: the chromadb client.
     """
@@ -296,6 +297,10 @@ def create_vector_db_from_dir(
         metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
     )

+    length = 0
+    if extra_docs:
+        length = len(collection.get()["ids"])
+
     if custom_text_split_function is not None:
         chunks = split_files_to_chunks(
             get_files_from_dir(dir_path, custom_text_types, recursive),
@@ -314,7 +319,7 @@ def create_vector_db_from_dir(
             end_idx = i + min(40000, len(chunks) - i)
             collection.upsert(
                 documents=chunks[i:end_idx],
-                ids=[f"doc_{j}" for j in range(i, end_idx)],  # unique for each doc
+                ids=[f"doc_{j+length}" for j in range(i, end_idx)],  # unique for each doc
             )
     except ValueError as e:
         logger.warning(f"{e}")
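
For reviewers, here is a minimal sketch (not part of the diff) of the appended-ID behavior that `extra_docs=True` enables in `create_vector_db_from_dir`: the second ingestion offsets its IDs by the current collection size instead of restarting at `doc_0`. The collection name matches the default; the chunk contents are illustrative.

```python
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("autogen-docs")

# First ingestion: IDs start at zero, as with extra_docs=False.
first_chunks = ["chunk A", "chunk B"]
collection.upsert(documents=first_chunks, ids=[f"doc_{j}" for j in range(len(first_chunks))])

# Second ingestion, mirroring extra_docs=True: offset new IDs by the current
# collection size so the existing doc_0/doc_1 are not overwritten.
length = len(collection.get()["ids"])  # 2
new_chunks = ["chunk C"]
collection.upsert(documents=new_chunks, ids=[f"doc_{j + length}" for j in range(len(new_chunks))])

print(sorted(collection.get()["ids"]))  # ['doc_0', 'doc_1', 'doc_2']
```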
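
From the agent side, opting in is a single `retrieve_config` key. A hedged usage sketch, where the `docs_path` value is a hypothetical directory of newly added documents:

```python
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Append ./new_docs (hypothetical path) to the existing "autogen-docs"
# collection instead of overwriting doc_0..doc_N from a previous run.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "qa",
        "docs_path": "./new_docs",
        "collection_name": "autogen-docs",
        "extra_docs": True,
    },
)
```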