
Commit fa49969

ShobhitVishnoi30, sonichi, svrapidinnovation, thinkall, and qingyun-wu authored
Add additional docs in retrieval agent if required (microsoft#1028)
* Update conversable_agent.py
* Add files via upload
* Delete notebook/Async_human_input.ipynb
* Add files via upload
* refactor:formatter
* feat:updated position
* Update dbutils.py
* added feature to add docs in retrieve
* Update dbutils.py
* Update retrieve_user_proxy_agent.py
* Update retrieve_utils.py
* Update qdrant_retrieve_user_proxy_agent.py
* Update qdrant_retrieve_user_proxy_agent.py
* feat:fixed pre commit issue

---------

Co-authored-by: Chi Wang <[email protected]>
Co-authored-by: svrapidinnovation <[email protected]>
Co-authored-by: Li Jiang <[email protected]>
Co-authored-by: Qingyun Wu <[email protected]>
1 parent 8169831 commit fa49969

File tree

3 files changed: +31, -5 lines


autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py

+19-3
@@ -47,6 +47,9 @@ def __init__(
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
                 the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): when True, adds new documents with unique IDs without overwriting existing ones; when False, default IDs are reused and existing documents may be replaced, risking a collection overwrite.
+                When True, new document chunks are assigned IDs starting from "length+i" (where "length" is the current number of documents in the collection), so existing documents are preserved and more content can be added to the collection.
+                By default, "extra_docs" is False and document IDs start from zero, so new documents may overwrite existing ones and cause unintended loss or alteration of data in the collection.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -116,6 +119,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                 custom_text_split_function=self.custom_text_split_function,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
+                extra_docs=self._extra_docs,
                 parallel=self._parallel,
                 on_disk=self._on_disk,
                 quantization_config=self._quantization_config,
@@ -146,6 +150,7 @@ def create_qdrant_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
     parallel: int = 0,
     on_disk: bool = False,
     quantization_config: Optional[models.QuantizationConfig] = None,
@@ -169,6 +174,7 @@ def create_qdrant_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
+        extra_docs (Optional, bool): whether to add more documents to the collection. Default is False.
         parallel (Optional, int): How many parallel workers to use for embedding. Defaults to the number of CPU cores
         on_disk (Optional, bool): Whether to store the collection on disk. Default is False.
         quantization_config: Quantization configuration. If None, quantization will be disabled.
@@ -194,22 +200,32 @@ def create_qdrant_from_dir(
     )
     logger.info(f"Found {len(chunks)} chunks.")

+    collection = None
     # Check if collection by same name exists, if not, create it with custom options
     try:
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
     except Exception:
         client.create_collection(
             collection_name=collection_name,
             vectors_config=client.get_fastembed_vector_params(
                 on_disk=on_disk, quantization_config=quantization_config, hnsw_config=hnsw_config
             ),
         )
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
+
+    length = 0
+    if extra_docs:
+        length = len(collection.get()["ids"])

     # Upsert in batch of 100 or less if the total number of chunks is less than 100
     for i in range(0, len(chunks), min(100, len(chunks))):
         end_idx = i + min(100, len(chunks) - i)
-        client.add(collection_name, documents=chunks[i:end_idx], ids=[j for j in range(i, end_idx)], parallel=parallel)
+        client.add(
+            collection_name,
+            documents=chunks[i:end_idx],
+            ids=[(j + length) for j in range(i, end_idx)],
+            parallel=parallel,
+        )

     # Create a payload index for the document field
     # Enables highly efficient payload filtering. Reference: https://qdrant.tech/documentation/concepts/indexing/#indexing
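The appended-ID logic above can be illustrated with a small stand-alone sketch. This is not the committed code: it assumes qdrant-client with the fastembed extra installed, derives the offset from `QdrantClient.count()` rather than from the retrieved collection object, and the helper name `append_chunks` plus the collection name are made up for the example.

```python
from typing import List

from qdrant_client import QdrantClient


def append_chunks(client: QdrantClient, collection_name: str, chunks: List[str], extra_docs: bool = True) -> None:
    """Illustrative helper: add text chunks without clobbering existing integer point IDs."""
    # Offset new IDs by the number of points already stored; 0 reproduces the old
    # overwrite-from-zero behaviour that extra_docs=False keeps as the default.
    length = client.count(collection_name=collection_name).count if extra_docs else 0

    # Upsert in batches of at most 100 chunks, mirroring the loop in create_qdrant_from_dir.
    for i in range(0, len(chunks), min(100, len(chunks))):
        end_idx = i + min(100, len(chunks) - i)
        client.add(
            collection_name,
            documents=chunks[i:end_idx],
            ids=[j + length for j in range(i, end_idx)],  # unique IDs appended after existing docs
        )


# Example run: seed a collection, then append two more chunks as IDs 1 and 2.
client = QdrantClient(":memory:")
client.add("autogen-docs", documents=["seed chunk"], ids=[0])
append_chunks(client, "autogen-docs", ["extra chunk one", "extra chunk two"])
```

Note that offsetting only avoids ID collisions; it does not deduplicate chunks that were already indexed in an earlier run.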

autogen/agentchat/contrib/retrieve_user_proxy_agent.py

+5
@@ -100,6 +100,9 @@ def __init__(
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
                 the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): when True, adds new documents with unique IDs without overwriting existing ones; when False, default IDs are reused and existing documents may be replaced, risking a collection overwrite.
+                When True, new document chunks are assigned IDs starting from "length+i" (where "length" is the current number of documents in the collection), so existing documents are preserved and more content can be added to the collection.
+                By default, "extra_docs" is False and document IDs start from zero, so new documents may overwrite existing ones and cause unintended loss or alteration of data in the collection.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -171,6 +174,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self._task = self._retrieve_config.get("task", "default")
         self._client = self._retrieve_config.get("client", chromadb.Client())
         self._docs_path = self._retrieve_config.get("docs_path", None)
+        self._extra_docs = self._retrieve_config.get("extra_docs", False)
         self._collection_name = self._retrieve_config.get("collection_name", "autogen-docs")
         if "docs_path" not in self._retrieve_config:
             logger.warning(
@@ -392,6 +396,7 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
                 custom_text_split_function=self.custom_text_split_function,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
+                extra_docs=self._extra_docs,
             )
             self._collection = True
             self._get_or_create = True
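From the caller's side, the new flag is just another `retrieve_config` key read in `__init__` above. A minimal usage sketch, assuming a collection that already holds documents; the docs path, collection name, and client choice below are placeholders, not taken from the commit.

```python
import chromadb

from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Reuse one chromadb client so the "autogen-docs" collection outlives the first
# indexing run (a persistent client would be typical in practice).
chroma_client = chromadb.Client()

ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "qa",
        "docs_path": "./new_docs",          # extra files to fold into the existing collection
        "client": chroma_client,
        "collection_name": "autogen-docs",  # same collection as the earlier indexing run
        "extra_docs": True,                 # append with offset IDs instead of overwriting doc_0, doc_1, ...
    },
)
```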

autogen/retrieve_utils.py

+7-2
@@ -250,6 +250,7 @@ def create_vector_db_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
 ) -> API:
     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
     a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
@@ -274,7 +275,7 @@ def create_vector_db_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
-
+        extra_docs (Optional, bool): whether to add more documents to the collection. Default is False.
     Returns:
         API: the chromadb client.
     """
@@ -296,6 +297,10 @@ def create_vector_db_from_dir(
             metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
         )

+        length = 0
+        if extra_docs:
+            length = len(collection.get()["ids"])
+
         if custom_text_split_function is not None:
             chunks = split_files_to_chunks(
                 get_files_from_dir(dir_path, custom_text_types, recursive),
@@ -314,7 +319,7 @@ def create_vector_db_from_dir(
            end_idx = i + min(40000, len(chunks) - i)
            collection.upsert(
                documents=chunks[i:end_idx],
-               ids=[f"doc_{j}" for j in range(i, end_idx)],  # unique for each doc
+               ids=[f"doc_{j+length}" for j in range(i, end_idx)],  # unique for each doc
            )
     except ValueError as e:
         logger.warning(f"{e}")
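To make the offset concrete, here is a small chromadb-only sketch of the `doc_{j+length}` scheme, assuming a recent chromadb release with `Collection.upsert`; the collection name and chunk contents are made up, and only the ID arithmetic mirrors the change above.

```python
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("autogen-docs")

# Pretend an earlier indexing run already stored two chunks as doc_0 and doc_1.
collection.upsert(documents=["old chunk A", "old chunk B"], ids=["doc_0", "doc_1"])

new_chunks = ["new chunk C", "new chunk D"]

# extra_docs=True path: offset new IDs by the number of stored IDs so doc_0/doc_1 survive.
length = len(collection.get()["ids"])  # 2
collection.upsert(
    documents=new_chunks,
    ids=[f"doc_{j + length}" for j in range(len(new_chunks))],  # doc_2, doc_3
)
print(sorted(collection.get()["ids"]))  # ['doc_0', 'doc_1', 'doc_2', 'doc_3']

# With extra_docs=False (the old default) length stays 0, the IDs restart at doc_0,
# and the upsert silently replaces the original documents instead of appending.
```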
