Skip to content

Commit

Permalink
refactor: Update ChromaComponent build method to allow duplicates in the Vector Store
Browse files Browse the repository at this point in the history
  • Loading branch information
ogabrielluiz committed Jun 10, 2024
1 parent becdb49 commit 24e8da5
Showing 1 changed file with 30 additions and 22 deletions.
52 changes: 30 additions & 22 deletions src/backend/base/langflow/components/vectorstores/Chroma.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import List, Optional, Union

import chromadb
Expand All @@ -6,6 +7,7 @@
from langchain_core.embeddings import Embeddings
from langchain_core.retrievers import BaseRetriever
from langchain_core.vectorstores import VectorStore

from langflow.base.vectorstores.utils import chroma_collection_to_records
from langflow.custom import CustomComponent
from langflow.schema import Record
Expand Down Expand Up @@ -48,6 +50,11 @@ def build_config(self):
"display_name": "Server SSL Enabled",
"advanced": True,
},
"allow_duplicates": {
"display_name": "Allow Duplicates",
"advanced": True,
"info": "If false, will not add documents that are already in the Vector Store.",
},
}

def build(
Expand All @@ -61,6 +68,7 @@ def build(
chroma_server_host: Optional[str] = None,
chroma_server_http_port: Optional[int] = None,
chroma_server_grpc_port: Optional[int] = None,
allow_duplicates: bool = False,
) -> Union[VectorStore, BaseRetriever]:
"""
Builds the Vector Store or BaseRetriever object.
Expand All @@ -75,6 +83,7 @@ def build(
- chroma_server_host (Optional[str]): The host for the Chroma server.
- chroma_server_http_port (Optional[int]): The HTTP port for the Chroma server.
- chroma_server_grpc_port (Optional[int]): The gRPC port for the Chroma server.
- allow_duplicates (bool): Whether to allow duplicates in the Vector Store.
Returns:
- Union[VectorStore, BaseRetriever]: The Vector Store or BaseRetriever object.
Expand All @@ -93,35 +102,34 @@ def build(
)
client = chromadb.HttpClient(settings=chroma_settings)

# If documents, then we need to create a Chroma instance using .from_documents

# Check index_directory and expand it if it is a relative path
if index_directory is not None:
index_directory = self.resolve_path(index_directory)

chroma = Chroma(
persist_directory=index_directory,
client=client,
embedding_function=embedding,
collection_name=collection_name,
)
if allow_duplicates:
stored_records = []
else:
stored_records = chroma_collection_to_records(chroma.get())
_stored_documents_without_id = []
for record in deepcopy(stored_records):
del record.id
_stored_documents_without_id.append(record)
documents = []
for _input in inputs or []:
if isinstance(_input, Record):
documents.append(_input.to_lc_document())
if _input not in _stored_documents_without_id:
documents.append(_input.to_lc_document())
else:
documents.append(_input)
if documents is not None and embedding is not None:
if len(documents) == 0:
raise ValueError("If documents are provided, there must be at least one document.")
chroma = Chroma.from_documents(
documents=documents, # type: ignore
persist_directory=index_directory,
collection_name=collection_name,
embedding=embedding,
client=client,
)
else:
chroma = Chroma(
persist_directory=index_directory,
client=client,
embedding_function=embedding,
)
raise ValueError("Inputs must be a Record objects.")

if documents and embedding is not None:
chroma.add_documents(documents)

store = chroma.get()
self.status = chroma_collection_to_records(store)
self.status = stored_records
return chroma

0 comments on commit 24e8da5

Please sign in to comment.