From b8d542a281347f3256829ed8688bf6c624915206 Mon Sep 17 00:00:00 2001 From: "Yang Qiao (from Dev Box)" Date: Mon, 14 Oct 2024 13:58:34 +0800 Subject: [PATCH 1/5] add diskann and update testcase --- .../vectorstores/azure_cosmos_db.py | 87 ++++ .../vectorstores/test_azure_cosmos_db.py | 489 ++++++++++++++++-- 2 files changed, 545 insertions(+), 31 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py index 92a450bd86f85..f5871a7b4933f 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py @@ -44,6 +44,8 @@ class CosmosDBVectorSearchType(str, Enum): """IVF vector index""" VECTOR_HNSW = "vector-hnsw" """HNSW vector index""" + VECTOR_DISKANN = "vector-diskann" + """DISKANN vector index""" logger = logging.getLogger(__name__) @@ -181,6 +183,8 @@ def create_index( kind: str = "vector-ivf", m: int = 16, ef_construction: int = 64, + maxDegree: int = 32, + lBuild: int = 50, ) -> dict[str, Any]: """Creates an index using the index name specified at instance construction @@ -215,6 +219,7 @@ def create_index( - vector-ivf - vector-hnsw: available as a preview feature only, to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features + - vector-diskann: available as a preview feature only num_lists: This integer is the number of clusters that the inverted file (IVF) index uses to group the vector data. We recommend that numLists is set to documentCount/1000 @@ -239,6 +244,12 @@ def create_index( better index quality and higher accuracy, but it will also increase the time required to build the index. ef_construction has to be at least 2 * m + maxDegree: Max number of neighbors. + Default value is 32, range from 20 to 2048. + Only vector-diskann search supports this for now. + lBuild: l value for index building. + Default value is 50, range from 10 to 500. + Only vector-diskann search supports this for now. Returns: An object describing the created index @@ -254,6 +265,10 @@ def create_index( create_index_commands = self._get_vector_index_hnsw( kind, m, ef_construction, similarity, dimensions ) + elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN: + create_index_commands = self._get_vector_index_diskann( + kind, maxDegree, lBuild, similarity, dimensions + ) # retrieve the database object current_database = self._collection.database @@ -306,6 +321,27 @@ def _get_vector_index_hnsw( } return command + def _get_vector_index_diskann( + self, kind: str, max_degree: int, l_build: int, similarity: str, dimensions: int + ) -> Dict[str, Any]: + command = { + "createIndexes": self._collection.name, + "indexes": [ + { + "name": self._index_name, + "key": {self._embedding_key: "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": kind, + "maxDegree": max_degree, + "lBuild": l_build, + "similarity": similarity, + "dimensions": dimensions, + }, + } + ], + } + return command + def create_filter_index( self, property_to_filter: str, @@ -421,6 +457,7 @@ def _similarity_search_with_score( pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + l_search: int = 40, with_embedding: bool = False, ) -> List[Tuple[Document, float]]: """Returns a list of documents with their scores @@ -433,12 +470,16 @@ def _similarity_search_with_score( - vector-ivf - vector-hnsw: available as a preview feature only, to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features + - vector-diskann: available as a preview feature only ef_search: The size of the dynamic candidate list for search (40 by default). A higher value provides better recall at the cost of speed. score_threshold: (Optional[float], optional): Maximum vector distance between selected documents and the query vector. Defaults to None. Only vector-ivf search supports this for now. + l_search: l value for index searching. + Default value is 40, range from 10 to 10000. + Only vector-diskann search supports this. Returns: A list of documents closest to the query vector @@ -450,6 +491,10 @@ def _similarity_search_with_score( pipeline = self._get_pipeline_vector_hnsw( embeddings, k, ef_search, pre_filter ) + elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN: + pipeline = self._get_pipeline_vector_diskann( + embeddings, k, l_search, pre_filter + ) cursor = self._collection.aggregate(pipeline) @@ -461,6 +506,9 @@ def _similarity_search_with_score( document_object_field = res.pop("document") text = document_object_field.pop(self._text_key) metadata = document_object_field.pop("metadata") + metadata["_id"] = document_object_field.pop( + "_id" + ) # '_id' is in new position if with_embedding: metadata[self._embedding_key] = document_object_field.pop( self._embedding_key @@ -527,6 +575,37 @@ def _get_pipeline_vector_hnsw( ] return pipeline + def _get_pipeline_vector_diskann( + self, + embeddings: List[float], + k: int = 4, + l_search: int = 40, + pre_filter: Optional[Dict] = None, + ) -> List[dict[str, Any]]: + params = { + "vector": embeddings, + "path": self._embedding_key, + "k": k, + "lSearch": l_search, + } + if pre_filter: + params["filter"] = pre_filter + + pipeline: List[dict[str, Any]] = [ + { + "$search": { + "cosmosSearch": params, + } + }, + { + "$project": { + "similarityScore": {"$meta": "searchScore"}, + "document": "$$ROOT", + } + }, + ] + return pipeline + def similarity_search_with_score( self, query: str, @@ -535,6 +614,7 @@ def similarity_search_with_score( pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + l_search: int = 40, with_embedding: bool = False, ) -> List[Tuple[Document, float]]: embeddings = self._embedding.embed_query(query) @@ -545,6 +625,7 @@ def similarity_search_with_score( pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + l_search=l_search, with_embedding=with_embedding, ) return docs @@ -557,6 +638,7 @@ def similarity_search( pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + l_search: int = 40, with_embedding: bool = False, **kwargs: Any, ) -> List[Document]: @@ -567,6 +649,7 @@ def similarity_search( pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + l_search=l_search, with_embedding=with_embedding, ) return [doc for doc, _ in docs_and_scores] @@ -581,6 +664,7 @@ def max_marginal_relevance_search_by_vector( pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + l_search: int = 40, with_embedding: bool = False, **kwargs: Any, ) -> List[Document]: @@ -593,6 +677,7 @@ def max_marginal_relevance_search_by_vector( pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + l_search=l_search, with_embedding=with_embedding, ) @@ -616,6 +701,7 @@ def max_marginal_relevance_search( pre_filter: Optional[Dict] = None, ef_search: int = 40, score_threshold: float = 0.0, + l_search: int = 40, with_embedding: bool = False, **kwargs: Any, ) -> List[Document]: @@ -631,6 +717,7 @@ def max_marginal_relevance_search( pre_filter=pre_filter, ef_search=ef_search, score_threshold=score_threshold, + l_search=l_search, with_embedding=with_embedding, ) return docs diff --git a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py index b76bba231a2b2..5c00d0e4b93ae 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py +++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py @@ -8,7 +8,7 @@ import pytest from langchain_core.documents import Document -from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.embeddings import AzureOpenAIEmbeddings from langchain_community.vectorstores.azure_cosmos_db import ( AzureCosmosDBVectorSearch, CosmosDBSimilarityType, @@ -24,6 +24,7 @@ INDEX_NAME = "langchain-test-index" INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw" +INDEX_NAME_VECTOR_DISKANN = "langchain-test-index-diskann" NAMESPACE = "langchain_test_db.langchain_test_collection" CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "") DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") @@ -36,6 +37,9 @@ ef_construction = 64 ef_search = 40 score_threshold = 0.1 +maxDegree = 50 +lBuild = 40 +lSearch = 100 application_name = "LANGCHAIN_PYTHON" @@ -53,8 +57,9 @@ def collection() -> Any: @pytest.fixture() def azure_openai_embeddings() -> Any: - openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings( - deployment=model_deployment, model=model_name, chunk_size=1 + openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings( + model=model_name, + chunk_size=1, ) return openai_embeddings @@ -70,8 +75,12 @@ def azure_openai_embeddings() -> Any: class TestAzureCosmosDBVectorSearch: @classmethod def setup_class(cls) -> None: - if not os.getenv("OPENAI_API_KEY"): - raise ValueError("OPENAI_API_KEY environment variable is not set") + if not os.getenv("AZURE_OPENAI_API_KEY"): + raise ValueError("AZURE_OPENAI_API_KEY environment variable is not set") + if not os.getenv("AZURE_OPENAI_ENDPOINT"): + raise ValueError("AZURE_OPENAI_ENDPOINT environment variable is not set") + if not os.getenv("AZURE_OPENAI_API_VERSION"): + raise ValueError("AZURE_OPENAI_API_VERSION environment variable is not set") # insure the test collection is empty collection = prepare_collection() @@ -95,7 +104,7 @@ def cosmos_db_url(self) -> Union[str, Generator[str, None, None]]: return "805.555.1212" def test_from_documents_cosine_distance( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: """Test end to end construction and search.""" documents = [ @@ -135,7 +144,7 @@ def test_from_documents_cosine_distance( vectorstore.delete_index() def test_from_documents_inner_product( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: """Test end to end construction and search.""" documents = [ @@ -174,7 +183,7 @@ def test_from_documents_inner_product( vectorstore.delete_index() def test_from_texts_cosine_distance( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -208,7 +217,7 @@ def test_from_texts_cosine_distance( vectorstore.delete_index() def test_from_texts_with_metadatas_cosine_distance( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -246,7 +255,7 @@ def test_from_texts_with_metadatas_cosine_distance( vectorstore.delete_index() def test_from_texts_with_metadatas_delete_one( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -280,7 +289,6 @@ def test_from_texts_with_metadatas_delete_one( assert output assert output[0].page_content == "What is a sandwich?" assert output[0].metadata["c"] == 1 - first_document_id_object = output[0].metadata["_id"] first_document_id = str(first_document_id_object) @@ -300,7 +308,7 @@ def test_from_texts_with_metadatas_delete_one( vectorstore.delete_index() def test_from_texts_with_metadatas_delete_multiple( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -359,7 +367,7 @@ def test_from_texts_with_metadatas_delete_multiple( vectorstore.delete_index() def test_from_texts_with_metadatas_inner_product( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -397,7 +405,7 @@ def test_from_texts_with_metadatas_inner_product( vectorstore.delete_index() def test_from_texts_with_metadatas_euclidean_distance( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -435,7 +443,7 @@ def test_from_texts_with_metadatas_euclidean_distance( vectorstore.delete_index() def test_max_marginal_relevance_cosine_distance( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = ["foo", "foo", "fou", "foy"] vectorstore = AzureCosmosDBVectorSearch.from_texts( @@ -453,7 +461,12 @@ def test_max_marginal_relevance_cosine_distance( query = "foo" output = vectorstore.max_marginal_relevance_search( - query, k=10, kind=kind, lambda_mult=0.1, score_threshold=score_threshold + query, + k=10, + kind=kind, + lambda_mult=0.1, + score_threshold=score_threshold, + with_embedding=True, ) assert len(output) == len(texts) @@ -463,7 +476,7 @@ def test_max_marginal_relevance_cosine_distance( vectorstore.delete_index() def test_max_marginal_relevance_inner_product( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = ["foo", "foo", "fou", "foy"] vectorstore = AzureCosmosDBVectorSearch.from_texts( @@ -481,7 +494,12 @@ def test_max_marginal_relevance_inner_product( query = "foo" output = vectorstore.max_marginal_relevance_search( - query, k=10, kind=kind, lambda_mult=0.1, score_threshold=score_threshold + query, + k=10, + kind=kind, + lambda_mult=0.1, + score_threshold=score_threshold, + with_embedding=True, ) assert len(output) == len(texts) @@ -495,7 +513,7 @@ def test_max_marginal_relevance_inner_product( """ def test_from_documents_cosine_distance_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: """Test end to end construction and search.""" documents = [ @@ -539,7 +557,7 @@ def test_from_documents_cosine_distance_vector_hnsw( vectorstore.delete_index() def test_from_documents_inner_product_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: """Test end to end construction and search.""" documents = [ @@ -583,7 +601,7 @@ def test_from_documents_inner_product_vector_hnsw( vectorstore.delete_index() def test_from_texts_cosine_distance_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -622,7 +640,7 @@ def test_from_texts_cosine_distance_vector_hnsw( vectorstore.delete_index() def test_from_texts_with_metadatas_cosine_distance_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -665,7 +683,7 @@ def test_from_texts_with_metadatas_cosine_distance_vector_hnsw( vectorstore.delete_index() def test_from_texts_with_metadatas_delete_one_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -724,7 +742,7 @@ def test_from_texts_with_metadatas_delete_one_vector_hnsw( vectorstore.delete_index() def test_from_texts_with_metadatas_delete_multiple_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -788,7 +806,7 @@ def test_from_texts_with_metadatas_delete_multiple_vector_hnsw( vectorstore.delete_index() def test_from_texts_with_metadatas_inner_product_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = [ "Dogs are tough.", @@ -831,7 +849,7 @@ def test_from_texts_with_metadatas_inner_product_vector_hnsw( vectorstore.delete_index() def test_max_marginal_relevance_cosine_distance_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = ["foo", "foo", "fou", "foy"] vectorstore = AzureCosmosDBVectorSearch.from_texts( @@ -859,6 +877,7 @@ def test_max_marginal_relevance_cosine_distance_vector_hnsw( kind=CosmosDBVectorSearchType.VECTOR_HNSW, lambda_mult=0.1, score_threshold=score_threshold, + with_embedding=True, ) assert len(output) == len(texts) @@ -868,7 +887,7 @@ def test_max_marginal_relevance_cosine_distance_vector_hnsw( vectorstore.delete_index() def test_max_marginal_relevance_inner_product_vector_hnsw( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: texts = ["foo", "foo", "fou", "foy"] vectorstore = AzureCosmosDBVectorSearch.from_texts( @@ -896,6 +915,414 @@ def test_max_marginal_relevance_inner_product_vector_hnsw( kind=CosmosDBVectorSearchType.VECTOR_HNSW, lambda_mult=0.1, score_threshold=score_threshold, + with_embedding=True, + ) + + assert len(output) == len(texts) + assert output[0].page_content == "foo" + assert output[1].page_content != "foo" + + vectorstore.delete_index() + + """ + Test cases for the similarity algorithm using vector-diskann + """ + + def test_from_documents_cosine_distance_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + """Test end to end construction and search.""" + documents = [ + Document(page_content="Dogs are tough.", metadata={"a": 1}), + Document(page_content="Cats have fluff.", metadata={"b": 1}), + Document(page_content="What is a sandwich?", metadata={"c": 1}), + Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}), + ] + + vectorstore = AzureCosmosDBVectorSearch.from_documents( + documents, + azure_openai_embeddings, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + sleep(1) # waits for Cosmos DB to save contents to the collection + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + assert output + assert output[0].page_content == "What is a sandwich?" + assert output[0].metadata["c"] == 1 + + vectorstore.delete_index() + + def test_from_documents_inner_product_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + """Test end to end construction and search.""" + documents = [ + Document(page_content="Dogs are tough.", metadata={"a": 1}), + Document(page_content="Cats have fluff.", metadata={"b": 1}), + Document(page_content="What is a sandwich?", metadata={"c": 1}), + Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}), + ] + + vectorstore = AzureCosmosDBVectorSearch.from_documents( + documents, + azure_openai_embeddings, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + sleep(1) # waits for Cosmos DB to save contents to the collection + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + assert output + assert output[0].page_content == "What is a sandwich?" + assert output[0].metadata["c"] == 1 + + vectorstore.delete_index() + + def test_from_texts_cosine_distance_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "That fence is purple.", + ] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + assert output[0].page_content == "What is a sandwich?" + + vectorstore.delete_index() + + def test_from_texts_with_metadatas_cosine_distance_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "The fence is purple.", + ] + metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + metadatas=metadatas, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + assert output + assert output[0].page_content == "What is a sandwich?" + assert output[0].metadata["c"] == 1 + + vectorstore.delete_index() + + def test_from_texts_with_metadatas_delete_one_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "The fence is purple.", + ] + metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + metadatas=metadatas, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + assert output + assert output[0].page_content == "What is a sandwich?" + assert output[0].metadata["c"] == 1 + + first_document_id_object = output[0].metadata["_id"] + first_document_id = str(first_document_id_object) + + vectorstore.delete_document_by_id(first_document_id) + sleep(2) # waits for the index to be updated + + output2 = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + assert output2 + assert output2[0].page_content != "What is a sandwich?" + + vectorstore.delete_index() + + def test_from_texts_with_metadatas_delete_multiple_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "The fence is purple.", + ] + metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + metadatas=metadatas, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=5, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + first_document_id = str(output[0].metadata["_id"]) + + second_document_id = str(output[1].metadata["_id"]) + + third_document_id = str(output[2].metadata["_id"]) + + document_ids = [first_document_id, second_document_id, third_document_id] + vectorstore.delete(document_ids) + sleep(2) # waits for the index to be updated + + output_2 = vectorstore.similarity_search( + "Sandwich", + k=5, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + assert output + assert output_2 + + assert len(output) == 4 # we should see all the four documents + assert ( + len(output_2) == 1 + ) # we should see only one document left after three have been deleted + + vectorstore.delete_index() + + def test_from_texts_with_metadatas_inner_product_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "The fence is purple.", + ] + metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + metadatas=metadatas, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + output = vectorstore.similarity_search( + "Sandwich", + k=1, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lSearch=lSearch, + ) + + assert output + assert output[0].page_content == "What is a sandwich?" + assert output[0].metadata["c"] == 1 + + vectorstore.delete_index() + + def test_max_marginal_relevance_cosine_distance_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = ["foo", "foo", "fou", "foy"] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the IVF index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + query = "foo" + output = vectorstore.max_marginal_relevance_search( + query, + k=10, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lambda_mult=0.1, + lSearch=lSearch, + with_embedding=True, + ) + + assert len(output) == len(texts) + assert output[0].page_content == "foo" + assert output[1].page_content != "foo" + + vectorstore.delete_index() + + def test_max_marginal_relevance_inner_product_vector_diskann( + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any + ) -> None: + texts = ["foo", "foo", "fou", "foy"] + vectorstore = AzureCosmosDBVectorSearch.from_texts( + texts, + azure_openai_embeddings, + collection=collection, + index_name=INDEX_NAME_VECTOR_DISKANN, + ) + + # Create the DiskANN index that will be leveraged later for vector search + vectorstore.create_index( + num_lists, + dimensions, + similarity_algorithm, + CosmosDBVectorSearchType.VECTOR_DISKANN, + maxDegree, + lBuild, + ) + sleep(2) # waits for the index to be set up + + query = "foo" + output = vectorstore.max_marginal_relevance_search( + query, + k=10, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + lambda_mult=0.1, + lSearch=lSearch, + with_embedding=True, ) assert len(output) == len(texts) @@ -906,7 +1333,7 @@ def test_max_marginal_relevance_inner_product_vector_hnsw( @staticmethod def invoke_delete_with_no_args( - azure_openai_embeddings: OpenAIEmbeddings, collection: Any + azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> Optional[bool]: vectorstore: AzureCosmosDBVectorSearch = ( AzureCosmosDBVectorSearch.from_connection_string( @@ -922,7 +1349,7 @@ def invoke_delete_with_no_args( @staticmethod def invoke_delete_by_id_with_no_args( - azure_openai_embeddings: OpenAIEmbeddings, collection: Any + azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: vectorstore: AzureCosmosDBVectorSearch = ( AzureCosmosDBVectorSearch.from_connection_string( @@ -937,14 +1364,14 @@ def invoke_delete_by_id_with_no_args( vectorstore.delete_document_by_id() def test_invalid_arguments_to_delete( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: with pytest.raises(ValueError) as exception_info: self.invoke_delete_with_no_args(azure_openai_embeddings, collection) assert str(exception_info.value) == "No document ids provided to delete." def test_no_arguments_to_delete_by_id( - self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any + self, azure_openai_embeddings: AzureOpenAIEmbeddings, collection: Any ) -> None: with pytest.raises(Exception) as exception_info: self.invoke_delete_by_id_with_no_args( From c2e76325ca398aea0586bb66b6a3976a023e747b Mon Sep 17 00:00:00 2001 From: "Yang Qiao (from Dev Box)" Date: Mon, 14 Oct 2024 14:40:01 +0800 Subject: [PATCH 2/5] typo --- .../vectorstores/azure_cosmos_db.py | 10 +- .../vectorstores/test_azure_cosmos_db.py | 99 +++++++++---------- 2 files changed, 50 insertions(+), 59 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py index f5871a7b4933f..79045c6774d1c 100644 --- a/libs/community/langchain_community/vectorstores/azure_cosmos_db.py +++ b/libs/community/langchain_community/vectorstores/azure_cosmos_db.py @@ -183,8 +183,8 @@ def create_index( kind: str = "vector-ivf", m: int = 16, ef_construction: int = 64, - maxDegree: int = 32, - lBuild: int = 50, + max_degree: int = 32, + l_build: int = 50, ) -> dict[str, Any]: """Creates an index using the index name specified at instance construction @@ -244,10 +244,10 @@ def create_index( better index quality and higher accuracy, but it will also increase the time required to build the index. ef_construction has to be at least 2 * m - maxDegree: Max number of neighbors. + max_degree: Max number of neighbors. Default value is 32, range from 20 to 2048. Only vector-diskann search supports this for now. - lBuild: l value for index building. + l_build: l value for index building. Default value is 50, range from 10 to 500. Only vector-diskann search supports this for now. Returns: @@ -267,7 +267,7 @@ def create_index( ) elif kind == CosmosDBVectorSearchType.VECTOR_DISKANN: create_index_commands = self._get_vector_index_diskann( - kind, maxDegree, lBuild, similarity, dimensions + kind, max_degree, l_build, similarity, dimensions ) # retrieve the database object diff --git a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py index 5c00d0e4b93ae..b75c920224a2f 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py +++ b/libs/community/tests/integration_tests/vectorstores/test_azure_cosmos_db.py @@ -949,12 +949,11 @@ def test_from_documents_cosine_distance_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -992,12 +991,11 @@ def test_from_documents_inner_product_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1032,12 +1030,11 @@ def test_from_texts_cosine_distance_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1072,12 +1069,11 @@ def test_from_texts_with_metadatas_cosine_distance_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1114,12 +1110,11 @@ def test_from_texts_with_metadatas_delete_one_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1171,12 +1166,11 @@ def test_from_texts_with_metadatas_delete_multiple_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1233,12 +1227,11 @@ def test_from_texts_with_metadatas_inner_product_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1268,12 +1261,11 @@ def test_max_marginal_relevance_cosine_distance_vector_diskann( # Create the IVF index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up @@ -1306,12 +1298,11 @@ def test_max_marginal_relevance_inner_product_vector_diskann( # Create the DiskANN index that will be leveraged later for vector search vectorstore.create_index( - num_lists, - dimensions, - similarity_algorithm, - CosmosDBVectorSearchType.VECTOR_DISKANN, - maxDegree, - lBuild, + dimensions=dimensions, + similarity=similarity_algorithm, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + max_degree=maxDegree, + l_build=lBuild, ) sleep(2) # waits for the index to be set up From 38739bb9102ff9081905f0d20a2bacbf4f74a1c9 Mon Sep 17 00:00:00 2001 From: "Yang Qiao (from Dev Box)" Date: Wed, 16 Oct 2024 16:26:17 +0800 Subject: [PATCH 3/5] update example notebook --- .../vectorstores/azure_cosmos_db.ipynb | 249 ++++++++++++++++-- 1 file changed, 221 insertions(+), 28 deletions(-) diff --git a/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb b/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb index f4d93afa0138d..886d6a28d8ae8 100644 --- a/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb +++ b/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb @@ -38,9 +38,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -74,7 +71,7 @@ "id": "f2e66b097c6ce2e3", "metadata": {}, "source": [ - "We want to use `OpenAIEmbeddings` so we need to set up our Azure OpenAI API Key alongside other environment variables. " + "We want to use `AzureOpenAIEmbeddings` so we need to set up our Azure OpenAI API Key alongside other environment variables. " ] }, { @@ -90,15 +87,10 @@ "outputs": [], "source": [ "# Set up the OpenAI Environment Variables\n", - "os.environ[\"OPENAI_API_TYPE\"] = \"azure\"\n", - "os.environ[\"OPENAI_API_VERSION\"] = \"2023-05-15\"\n", - "os.environ[\"OPENAI_API_BASE\"] = (\n", - " \"YOUR_OPEN_AI_ENDPOINT\" # https://example.openai.azure.com/\n", - ")\n", - "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"\n", - "os.environ[\"OPENAI_EMBEDDINGS_DEPLOYMENT\"] = (\n", - " \"smart-agent-embedding-ada\" # the deployment name for the embedding model\n", - ")\n", + "\n", + "os.environ[\"AZURE_OPENAI_API_KEY\"] = \"YOUR_AZURE_OPENAI_API_KEY\"\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"YOUR_AZURE_OPENAI_ENDPOINT\"\n", + "os.environ[\"AZURE_OPENAI_API_VERSION\"] = \"2023-05-15\"\n", "os.environ[\"OPENAI_EMBEDDINGS_MODEL_NAME\"] = \"text-embedding-ada-002\" # the model name" ] }, @@ -130,7 +122,7 @@ " CosmosDBSimilarityType,\n", " CosmosDBVectorSearchType,\n", ")\n", - "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_openai import AzureOpenAIEmbeddings\n", "from langchain_text_splitters import CharacterTextSplitter\n", "\n", "SOURCE_FILE_NAME = \"../../how_to/state_of_the_union.txt\"\n", @@ -147,14 +139,90 @@ "model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n", "\n", "\n", - "openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(\n", - " deployment=model_deployment, model=model_name, chunk_size=1\n", + "openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(\n", + " model=model_name, chunk_size=1\n", ")" ] }, { "cell_type": "code", "execution_count": 5, + "id": "f6c6ed80-7b91-4833-bab5-c9b2b5edcdec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(metadata={'source': '../../how_to/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "610d3faa-c4e7-41cb-8ad7-28673efe9ecf", + "metadata": {}, + "outputs": [], + "source": [ + "from pymongo import MongoClient\n", + "\n", + "# INDEX_NAME = \"izzy-test-index-2\"\n", + "# NAMESPACE = \"izzy_test_db.izzy_test_collection\"\n", + "# DB_NAME, COLLECTION_NAME = NAMESPACE.split(\".\")\n", + "\n", + "client: MongoClient = MongoClient(CONNECTION_STRING)\n", + "collection = client[DB_NAME][COLLECTION_NAME]\n", + "\n", + "model_deployment = os.getenv(\n", + " \"OPENAI_EMBEDDINGS_DEPLOYMENT\", \"smart-agent-embedding-ada\"\n", + ")\n", + "model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n", + "\n", + "vectorstore = AzureCosmosDBVectorSearch.from_documents(\n", + " docs,\n", + " openai_embeddings,\n", + " collection=collection,\n", + " index_name=INDEX_NAME,\n", + ")\n", + "\n", + "# Read more about these variables in detail here. https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search\n", + "num_lists = 100\n", + "dimensions = 1536\n", + "similarity_algorithm = CosmosDBSimilarityType.COS\n", + "kind = CosmosDBVectorSearchType.VECTOR_IVF\n", + "m = 16\n", + "ef_construction = 64\n", + "ef_search = 40\n", + "score_threshold = 0.1\n", + "\n", + "\n", + "# HNSW vectorstore\n", + "dimensions = 1536\n", + "similarity_algorithm = CosmosDBSimilarityType.COS\n", + "kind = CosmosDBVectorSearchType.VECTOR_HNSW\n", + "m = 16\n", + "ef_construction = 64\n", + "\n", + "vectorstore.create_index(\n", + " dimensions=dimensions,\n", + " similarity=similarity_algorithm,\n", + " kind=kind ,\n", + " m=m,\n", + " ef_construction=ef_construction,\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "39ae6058c2f7fdf1", "metadata": { "ExecuteTime": { @@ -166,14 +234,10 @@ { "data": { "text/plain": [ - "{'raw': {'defaultShard': {'numIndexesBefore': 1,\n", - " 'numIndexesAfter': 2,\n", - " 'createdCollectionAutomatically': False,\n", - " 'ok': 1}},\n", - " 'ok': 1}" + "'\\n# DiskANN vectorstore\\nmaxDegree = 40\\ndimensions = 1536\\nsimilarity_algorithm = CosmosDBSimilarityType.COS\\nkind = CosmosDBVectorSearchType.VECTOR_DISKANN\\nlBuild = 20\\n\\nvectorstore.create_index(\\n dimensions=dimensions,\\n similarity=similarity_algorithm,\\n kind=kind ,\\n max_degree=maxDegree,\\n l_build=lBuild,\\n )\\n\\n# -----------------------------------------------------------\\n\\n# HNSW vectorstore\\ndimensions = 1536\\nsimilarity_algorithm = CosmosDBSimilarityType.COS\\nkind = CosmosDBVectorSearchType.VECTOR_HNSW\\nm = 16\\nef_construction = 64\\n\\nvectorstore.create_index(\\n dimensions=dimensions,\\n similarity=similarity_algorithm,\\n kind=kind ,\\n m=m,\\n ef_construction=ef_construction,\\n )\\n'" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -212,12 +276,46 @@ "\n", "vectorstore.create_index(\n", " num_lists, dimensions, similarity_algorithm, kind, m, ef_construction\n", - ")" + ")\n", + "\n", + "'''\n", + "# DiskANN vectorstore\n", + "maxDegree = 40\n", + "dimensions = 1536\n", + "similarity_algorithm = CosmosDBSimilarityType.COS\n", + "kind = CosmosDBVectorSearchType.VECTOR_DISKANN\n", + "lBuild = 20\n", + "\n", + "vectorstore.create_index(\n", + " dimensions=dimensions,\n", + " similarity=similarity_algorithm,\n", + " kind=kind ,\n", + " max_degree=maxDegree,\n", + " l_build=lBuild,\n", + " )\n", + "\n", + "# -----------------------------------------------------------\n", + "\n", + "# HNSW vectorstore\n", + "dimensions = 1536\n", + "similarity_algorithm = CosmosDBSimilarityType.COS\n", + "kind = CosmosDBVectorSearchType.VECTOR_HNSW\n", + "m = 16\n", + "ef_construction = 64\n", + "\n", + "vectorstore.create_index(\n", + " dimensions=dimensions,\n", + " similarity=similarity_algorithm,\n", + " kind=kind ,\n", + " m=m,\n", + " ef_construction=ef_construction,\n", + " )\n", + "'''" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "32c68d3246adc21f", "metadata": { "ExecuteTime": { @@ -234,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "8feeeb4364efb204", "metadata": { "ExecuteTime": { @@ -271,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "3c218ab6f59301f7", "metadata": { "ExecuteTime": { @@ -308,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "fd67e4d92c9ab32f", "metadata": { "ExecuteTime": { @@ -352,10 +450,105 @@ "Azure Cosmos DB for MongoDB supports pre-filtering with $lt, $lte, $eq, $neq, $gte, $gt, $in, $nin, and $regex. To use this feature, enable \"filtering vector search\" in the \"Preview Features\" tab of your Azure Subscription. Learn more about preview features [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search#filtered-vector-search-preview)." ] }, + { + "cell_type": "code", + "execution_count": 29, + "id": "19c43de6-47f9-45f0-a422-8d852a5d191f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'raw': {'defaultShard': {'numIndexesBefore': 3,\n", + " 'numIndexesAfter': 4,\n", + " 'createdCollectionAutomatically': False,\n", + " 'ok': 1}},\n", + " 'ok': 1}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create a filter index\n", + "vectorstore.create_filter_index(property_to_filter= \"metadata.source\", index_name='filter_index')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "c7031279-dfb8-43f2-a7a8-d10a3786023b", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = vectorstore.similarity_search(\n", + " query,\n", + " pre_filter= {\"metadata.source\": { \"$ne\": \"filter content\" } }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "3860be72-d293-43b9-a727-425f166ff6c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "b7fb9800-b1cf-4315-af9d-e8c572d3e05f", + "metadata": {}, + "outputs": [], + "source": [ + "docs = vectorstore.similarity_search(\n", + " query,\n", + " pre_filter= {\"metadata.source\": { \"$ne\": \"../../how_to/state_of_the_union.txt\" } }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "dba9d39e-6220-4fad-84fa-e123aa7ca6e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "50bb4346", + "id": "25ea7250-6e8f-48e6-aac9-196effbdc8d8", "metadata": {}, "outputs": [], "source": [] From 4d7615d0dd4a6196d86e1f40db43ef7035153715 Mon Sep 17 00:00:00 2001 From: "Yang Qiao (from Dev Box)" Date: Wed, 16 Oct 2024 16:32:39 +0800 Subject: [PATCH 4/5] remove duplicate cells in the notebook --- .../vectorstores/azure_cosmos_db.ipynb | 55 ------------------- 1 file changed, 55 deletions(-) diff --git a/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb b/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb index 886d6a28d8ae8..ce5cb92d20054 100644 --- a/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb +++ b/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb @@ -165,61 +165,6 @@ "docs[0]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "610d3faa-c4e7-41cb-8ad7-28673efe9ecf", - "metadata": {}, - "outputs": [], - "source": [ - "from pymongo import MongoClient\n", - "\n", - "# INDEX_NAME = \"izzy-test-index-2\"\n", - "# NAMESPACE = \"izzy_test_db.izzy_test_collection\"\n", - "# DB_NAME, COLLECTION_NAME = NAMESPACE.split(\".\")\n", - "\n", - "client: MongoClient = MongoClient(CONNECTION_STRING)\n", - "collection = client[DB_NAME][COLLECTION_NAME]\n", - "\n", - "model_deployment = os.getenv(\n", - " \"OPENAI_EMBEDDINGS_DEPLOYMENT\", \"smart-agent-embedding-ada\"\n", - ")\n", - "model_name = os.getenv(\"OPENAI_EMBEDDINGS_MODEL_NAME\", \"text-embedding-ada-002\")\n", - "\n", - "vectorstore = AzureCosmosDBVectorSearch.from_documents(\n", - " docs,\n", - " openai_embeddings,\n", - " collection=collection,\n", - " index_name=INDEX_NAME,\n", - ")\n", - "\n", - "# Read more about these variables in detail here. https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search\n", - "num_lists = 100\n", - "dimensions = 1536\n", - "similarity_algorithm = CosmosDBSimilarityType.COS\n", - "kind = CosmosDBVectorSearchType.VECTOR_IVF\n", - "m = 16\n", - "ef_construction = 64\n", - "ef_search = 40\n", - "score_threshold = 0.1\n", - "\n", - "\n", - "# HNSW vectorstore\n", - "dimensions = 1536\n", - "similarity_algorithm = CosmosDBSimilarityType.COS\n", - "kind = CosmosDBVectorSearchType.VECTOR_HNSW\n", - "m = 16\n", - "ef_construction = 64\n", - "\n", - "vectorstore.create_index(\n", - " dimensions=dimensions,\n", - " similarity=similarity_algorithm,\n", - " kind=kind ,\n", - " m=m,\n", - " ef_construction=ef_construction,\n", - " )\n" - ] - }, { "cell_type": "code", "execution_count": 6, From 99923f6fa92bfec590d4446b76f00c1996a2b8d6 Mon Sep 17 00:00:00 2001 From: "Yang Qiao (from Dev Box)" Date: Thu, 7 Nov 2024 12:21:44 +0800 Subject: [PATCH 5/5] update --- .../integrations/vectorstores/azure_cosmos_db.ipynb | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb b/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb index ce5cb92d20054..cdf70e4f42a83 100644 --- a/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb +++ b/docs/docs/integrations/vectorstores/azure_cosmos_db.ipynb @@ -223,7 +223,7 @@ " num_lists, dimensions, similarity_algorithm, kind, m, ef_construction\n", ")\n", "\n", - "'''\n", + "\"\"\"\n", "# DiskANN vectorstore\n", "maxDegree = 40\n", "dimensions = 1536\n", @@ -255,7 +255,7 @@ " m=m,\n", " ef_construction=ef_construction,\n", " )\n", - "'''" + "\"\"\"" ] }, { @@ -418,7 +418,9 @@ ], "source": [ "# create a filter index\n", - "vectorstore.create_filter_index(property_to_filter= \"metadata.source\", index_name='filter_index')" + "vectorstore.create_filter_index(\n", + " property_to_filter=\"metadata.source\", index_name=\"filter_index\"\n", + ")" ] }, { @@ -430,8 +432,7 @@ "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs = vectorstore.similarity_search(\n", - " query,\n", - " pre_filter= {\"metadata.source\": { \"$ne\": \"filter content\" } }\n", + " query, pre_filter={\"metadata.source\": {\"$ne\": \"filter content\"}}\n", ")" ] }, @@ -465,7 +466,7 @@ "source": [ "docs = vectorstore.similarity_search(\n", " query,\n", - " pre_filter= {\"metadata.source\": { \"$ne\": \"../../how_to/state_of_the_union.txt\" } }\n", + " pre_filter={\"metadata.source\": {\"$ne\": \"../../how_to/state_of_the_union.txt\"}},\n", ")" ] },