-
Notifications
You must be signed in to change notification settings - Fork 15.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Community: Adding bulk_size as a setable param for OpenSearchVectorSearch #28325
Changes from all commits
51843d4
9fbccec
11383f4
6c3bd4a
0ee5af5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -402,6 +402,7 @@ def __init__( | |
self.client = _get_opensearch_client(opensearch_url, **kwargs) | ||
self.async_client = _get_async_opensearch_client(opensearch_url, **kwargs) | ||
self.engine = kwargs.get("engine", "nmslib") | ||
self.bulk_size = kwargs.get("bulk_size", 500) | ||
|
||
@property | ||
def embeddings(self) -> Embeddings: | ||
|
@@ -413,9 +414,10 @@ def __add( | |
embeddings: List[List[float]], | ||
metadatas: Optional[List[dict]] = None, | ||
ids: Optional[List[str]] = None, | ||
bulk_size: int = 500, | ||
bulk_size: Optional[int] = None, | ||
**kwargs: Any, | ||
) -> List[str]: | ||
bulk_size = bulk_size if bulk_size is not None else self.bulk_size | ||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size) | ||
index_name = kwargs.get("index_name", self.index_name) | ||
text_field = kwargs.get("text_field", "text") | ||
|
@@ -454,9 +456,10 @@ async def __aadd( | |
embeddings: List[List[float]], | ||
metadatas: Optional[List[dict]] = None, | ||
ids: Optional[List[str]] = None, | ||
bulk_size: int = 500, | ||
bulk_size: Optional[int] = None, | ||
**kwargs: Any, | ||
) -> List[str]: | ||
bulk_size = bulk_size if bulk_size is not None else self.bulk_size | ||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size) | ||
index_name = kwargs.get("index_name", self.index_name) | ||
text_field = kwargs.get("text_field", "text") | ||
|
@@ -560,7 +563,7 @@ def add_texts( | |
texts: Iterable[str], | ||
metadatas: Optional[List[dict]] = None, | ||
ids: Optional[List[str]] = None, | ||
bulk_size: int = 500, | ||
bulk_size: Optional[int] = None, | ||
**kwargs: Any, | ||
) -> List[str]: | ||
"""Run more texts through the embeddings and add to the vectorstore. | ||
|
@@ -582,6 +585,7 @@ def add_texts( | |
to "text". | ||
""" | ||
embeddings = self.embedding_function.embed_documents(list(texts)) | ||
bulk_size = bulk_size if bulk_size is not None else self.bulk_size | ||
return self.__add( | ||
texts, | ||
embeddings, | ||
|
@@ -596,14 +600,15 @@ async def aadd_texts( | |
texts: Iterable[str], | ||
metadatas: Optional[List[dict]] = None, | ||
ids: Optional[List[str]] = None, | ||
bulk_size: int = 500, | ||
bulk_size: Optional[int] = None, | ||
**kwargs: Any, | ||
) -> List[str]: | ||
""" | ||
Asynchronously run more texts through the embeddings | ||
and add to the vectorstore. | ||
""" | ||
embeddings = await self.embedding_function.aembed_documents(list(texts)) | ||
bulk_size = bulk_size if bulk_size is not None else self.bulk_size | ||
return await self.__aadd( | ||
texts, | ||
embeddings, | ||
|
@@ -618,7 +623,7 @@ def add_embeddings( | |
text_embeddings: Iterable[Tuple[str, List[float]]], | ||
metadatas: Optional[List[dict]] = None, | ||
ids: Optional[List[str]] = None, | ||
bulk_size: int = 500, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. breaking change - can we keep this, and use the passed-in as an override? can still default to the |
||
bulk_size: Optional[int] = None, | ||
**kwargs: Any, | ||
) -> List[str]: | ||
"""Add the given texts and embeddings to the vectorstore. | ||
|
@@ -641,6 +646,7 @@ def add_embeddings( | |
to "text". | ||
""" | ||
texts, embeddings = zip(*text_embeddings) | ||
bulk_size = bulk_size if bulk_size is not None else self.bulk_size | ||
return self.__add( | ||
list(texts), | ||
list(embeddings), | ||
|
@@ -1085,7 +1091,7 @@ def from_texts( | |
texts: List[str], | ||
embedding: Embeddings, | ||
metadatas: Optional[List[dict]] = None, | ||
bulk_size: int = 500, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. break |
||
bulk_size: Optional[int] = None, | ||
ids: Optional[List[str]] = None, | ||
**kwargs: Any, | ||
) -> OpenSearchVectorSearch: | ||
|
@@ -1134,6 +1140,7 @@ def from_texts( | |
""" | ||
embeddings = embedding.embed_documents(texts) | ||
bulk_size = bulk_size if bulk_size is not None else cls.bulk_size | ||
return cls.from_embeddings( | ||
embeddings, | ||
texts, | ||
|
@@ -1150,7 +1157,7 @@ async def afrom_texts( | |
texts: List[str], | ||
embedding: Embeddings, | ||
metadatas: Optional[List[dict]] = None, | ||
bulk_size: int = 500, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. break |
||
bulk_size: Optional[int] = None, | ||
ids: Optional[List[str]] = None, | ||
**kwargs: Any, | ||
) -> OpenSearchVectorSearch: | ||
|
@@ -1199,6 +1206,7 @@ async def afrom_texts( | |
""" | ||
embeddings = await embedding.aembed_documents(texts) | ||
bulk_size = bulk_size if bulk_size is not None else cls.bulk_size | ||
return await cls.afrom_embeddings( | ||
embeddings, | ||
texts, | ||
|
@@ -1216,7 +1224,7 @@ def from_embeddings( | |
texts: List[str], | ||
embedding: Embeddings, | ||
metadatas: Optional[List[dict]] = None, | ||
bulk_size: int = 500, | ||
bulk_size: Optional[int] = None, | ||
ids: Optional[List[str]] = None, | ||
**kwargs: Any, | ||
) -> OpenSearchVectorSearch: | ||
|
@@ -1285,6 +1293,7 @@ def from_embeddings( | |
"max_chunk_bytes", | ||
"is_aoss", | ||
] | ||
bulk_size = bulk_size if bulk_size is not None else cls.bulk_size | ||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size) | ||
dim = len(embeddings[0]) | ||
# Get the index name from either from kwargs or ENV Variable | ||
|
@@ -1346,7 +1355,7 @@ async def afrom_embeddings( | |
texts: List[str], | ||
embedding: Embeddings, | ||
metadatas: Optional[List[dict]] = None, | ||
bulk_size: int = 500, | ||
bulk_size: Optional[int] = None, | ||
ids: Optional[List[str]] = None, | ||
**kwargs: Any, | ||
) -> OpenSearchVectorSearch: | ||
|
@@ -1417,6 +1426,7 @@ async def afrom_embeddings( | |
"max_chunk_bytes", | ||
"is_aoss", | ||
] | ||
bulk_size = bulk_size if bulk_size is not None else cls.bulk_size | ||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size) | ||
dim = len(embeddings[0]) | ||
# Get the index name from either from kwargs or ENV Variable | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
break