From 729d4b2f8315bc7b8a2046310ac1ca4bbcf24d50 Mon Sep 17 00:00:00 2001 From: Eric Pinzur Date: Thu, 17 Oct 2024 16:19:05 +0200 Subject: [PATCH] updated tests --- .../graph_vectorstores/cassandra.py | 22 +- .../vectorstores/cassandra.py | 37 +- .../graph_vectorstores/test_cassandra.py | 726 +++--- .../test_upgrade_to_cassandra.py | 398 ++-- .../vectorstores/test_cassandra.py | 2001 +++++++++-------- 5 files changed, 1578 insertions(+), 1606 deletions(-) diff --git a/libs/community/langchain_community/graph_vectorstores/cassandra.py b/libs/community/langchain_community/graph_vectorstores/cassandra.py index a540f7d6cb3b80..5b377d61f51120 100644 --- a/libs/community/langchain_community/graph_vectorstores/cassandra.py +++ b/libs/community/langchain_community/graph_vectorstores/cassandra.py @@ -116,7 +116,7 @@ def __init__( *, body_index_options: list[tuple[str, Any]] | None = None, setup_mode: SetupMode = SetupMode.SYNC, - metadata_deny_list: Iterable[str] = [], + metadata_deny_list: Optional[list[str]] = None, ) -> None: """Apache Cassandra(R) for graph-vector-store workloads. 
@@ -164,9 +164,9 @@ def __init__( """ self.embedding = embedding - deny_list = set(metadata_deny_list) - deny_list.add(METADATA_LINKS_KEY) - self._metadata_deny_list = deny_list + if metadata_deny_list is None: + metadata_deny_list = [] + metadata_deny_list = [*metadata_deny_list, METADATA_LINKS_KEY] self.vector_store = CassandraVectorStore( embedding=embedding, @@ -176,7 +176,7 @@ def __init__( ttl_seconds=ttl_seconds, body_index_options=body_index_options, setup_mode=setup_mode, - metadata_indexing=("deny_list", deny_list), + metadata_indexing=("deny_list", metadata_deny_list), ) store_session: Session = self.vector_store.session @@ -1032,7 +1032,7 @@ def from_texts( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_deny_list: Iterable[str] = [], + metadata_deny_list: Optional[list[str]] = None, **kwargs: Any, ) -> CGVST: """Create a CassandraGraphVectorStore from raw texts. @@ -1094,7 +1094,7 @@ async def afrom_texts( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_deny_list: Iterable[str] = [], + metadata_deny_list: Optional[list[str]] = None, **kwargs: Any, ) -> CGVST: """Create a CassandraGraphVectorStore from raw texts. @@ -1165,7 +1165,7 @@ def from_documents( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_deny_list: Iterable[str] = [], + metadata_deny_list: Optional[list[str]] = None, **kwargs: Any, ) -> CGVST: """Create a CassandraGraphVectorStore from a document list. 
@@ -1220,7 +1220,7 @@ async def afrom_documents( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_deny_list: Iterable[str] = [], + metadata_deny_list: Optional[list[str]] = None, **kwargs: Any, ) -> CGVST: """Create a CassandraGraphVectorStore from a document list. @@ -1262,5 +1262,7 @@ async def afrom_documents( metadata_deny_list=metadata_deny_list, **kwargs, ) - await store.aadd_documents(documents=cls._add_ids_to_docs(docs=documents, ids=ids)) + await store.aadd_documents( + documents=cls._add_ids_to_docs(docs=documents, ids=ids) + ) return store diff --git a/libs/community/langchain_community/vectorstores/cassandra.py b/libs/community/langchain_community/vectorstores/cassandra.py index ec25ac281c81f2..b5e63d3e3f2f79 100644 --- a/libs/community/langchain_community/vectorstores/cassandra.py +++ b/libs/community/langchain_community/vectorstores/cassandra.py @@ -4,6 +4,7 @@ import importlib.metadata import typing import uuid +import warnings from typing import ( Any, Awaitable, @@ -1165,7 +1166,7 @@ def from_texts( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_indexing: Iterable[str] = [], + metadata_indexing: Union[Tuple[str, Iterable[str]], str] = "all", **kwargs: Any, ) -> CVST: """Create a Cassandra vector store from raw texts. @@ -1229,7 +1230,7 @@ async def afrom_texts( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_indexing: Iterable[str] = [], + metadata_indexing: Union[Tuple[str, Iterable[str]], str] = "all", **kwargs: Any, ) -> CVST: """Create a Cassandra vector store from raw texts. 
@@ -1302,7 +1303,7 @@ def from_documents( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_indexing: Iterable[str] = [], + metadata_indexing: Union[Tuple[str, Iterable[str]], str] = "all", **kwargs: Any, ) -> CVST: """Create a Cassandra vector store from a document list. @@ -1334,6 +1335,18 @@ def from_documents( Returns: a Cassandra vector store. """ + if ids is not None: + warnings.warn( + ( + "Parameter `ids` to Cassandra's `from_documents` " + "method is deprecated. Please set the supplied documents' " + "`.id` attribute instead. The id attribute of Document " + "is ignored as long as the `ids` parameter is passed." + ), + DeprecationWarning, + stacklevel=2, + ) + store = cls( embedding=embedding, session=session, @@ -1359,7 +1372,7 @@ async def afrom_documents( ids: Optional[List[str]] = None, ttl_seconds: Optional[int] = None, body_index_options: Optional[List[Tuple[str, Any]]] = None, - metadata_indexing: Iterable[str] = [], + metadata_indexing: Union[Tuple[str, Iterable[str]], str] = "all", **kwargs: Any, ) -> CVST: """Create a Cassandra vector store from a document list. @@ -1391,6 +1404,18 @@ async def afrom_documents( Returns: a Cassandra vector store. """ + if ids is not None: + warnings.warn( + ( + "Parameter `ids` to Cassandra's `afrom_documents` " + "method is deprecated. Please set the supplied documents' " + "`.id` attribute instead. The id attribute of Document " + "is ignored as long as the `ids` parameter is passed." 
+ ), + DeprecationWarning, + stacklevel=2, + ) + store = cls( embedding=embedding, session=session, @@ -1402,7 +1427,9 @@ async def afrom_documents( metadata_indexing=metadata_indexing, **kwargs, ) - await store.aadd_documents(documents=cls._add_ids_to_docs(docs=documents, ids=ids)) + await store.aadd_documents( + documents=cls._add_ids_to_docs(docs=documents, ids=ids) + ) return store def as_retriever( diff --git a/libs/community/tests/integration_tests/graph_vectorstores/test_cassandra.py b/libs/community/tests/integration_tests/graph_vectorstores/test_cassandra.py index 80c7e127dfd3f1..9d0a3e6be77a84 100644 --- a/libs/community/tests/integration_tests/graph_vectorstores/test_cassandra.py +++ b/libs/community/tests/integration_tests/graph_vectorstores/test_cassandra.py @@ -1,10 +1,12 @@ """Test of Apache Cassandra graph vector g_store class `CassandraGraphVectorStore`""" import json -import random import os -from typing import Any, Iterable, List, Optional, Tuple, Union +import random +from contextlib import contextmanager +from typing import Any, Generator, Iterable, List, Optional +import pytest from langchain_core.documents import Document from langchain_core.embeddings import Embeddings @@ -15,9 +17,9 @@ Link, add_links, ) - from tests.integration_tests.cache.fake_embeddings import ( - FakeEmbeddings, AngularTwoDimensionalEmbeddings, + AngularTwoDimensionalEmbeddings, + FakeEmbeddings, ) TEST_KEYSPACE = "graph_test_keyspace" @@ -50,7 +52,8 @@ async def aembed_query(self, text: str) -> list[float]: return self.embed_query(text) -def _embedding_d2() -> Embeddings: +@pytest.fixture +def embedding_d2() -> Embeddings: return ParserEmbeddings(dimension=2) @@ -72,7 +75,6 @@ def embed_query(self, text: str) -> list[float]: vector = self.get_vector_near(0.8) else: vector = self.get_vector_near(0.1) - print(f"Embedded {text} as {vector}") return vector async def aembed_query(self, text: str) -> list[float]: @@ -83,9 +85,10 @@ def _result_ids(docs: Iterable[Document]) 
-> List[Optional[str]]: return [doc.id for doc in docs] -def _graph_vector_store_docs() -> list[Document]: +@pytest.fixture +def graph_vector_store_docs() -> list[Document]: """ - This is a set of Documents to pre-populate a graph vector g_store, + This is a set of Documents to pre-populate a graph vector store, with entries placed in a certain way. Space of the entries (under Euclidean similarity): @@ -146,10 +149,11 @@ def _graph_vector_store_docs() -> list[Document]: return docs_a + docs_b + docs_f + docs_t -def _get_cassandra_session(table_name: str, drop: bool = True) -> Any: +@contextmanager +def cassandra_session(table_name: str, drop: bool = True) -> Generator[Any, None, None]: + # Initialize the Cassandra cluster and session from cassandra.cluster import Cluster - # get db connection if "CASSANDRA_CONTACT_POINTS" in os.environ: contact_points = [ cp.strip() @@ -158,127 +162,95 @@ def _get_cassandra_session(table_name: str, drop: bool = True) -> Any: ] else: contact_points = None + cluster = Cluster(contact_points) session = cluster.connect() - # ensure keyspace exists - session.execute( - ( - f"CREATE KEYSPACE IF NOT EXISTS {TEST_KEYSPACE} " - f"WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}}" - ) - ) - # drop table if required - if drop: - session.execute(f"DROP TABLE IF EXISTS {TEST_KEYSPACE}.{table_name}") - - return session + try: + # Ensure keyspace exists + session.execute( + ( + f"CREATE KEYSPACE IF NOT EXISTS {TEST_KEYSPACE}" + " WITH replication = " + "{'class': 'SimpleStrategy', 'replication_factor': 1}" + ) + ) + # Drop table if required + if drop: + session.execute(f"DROP TABLE IF EXISTS {TEST_KEYSPACE}.{table_name}") -def _graphvectorstore_from_texts( - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - drop: bool = True, - metadata_deny_list: Iterable[str] = [], - table_name: str = "graph_test_table", -) -> CassandraGraphVectorStore: - session = 
_get_cassandra_session(table_name=table_name, drop=drop) - return CassandraGraphVectorStore.from_texts( - texts=texts, - embedding=embedding, - metadatas=metadatas, - ids=ids, - session=session, - keyspace=TEST_KEYSPACE, - table_name=table_name, - metadata_deny_list=metadata_deny_list, - ) + # Yield the session for usage + yield session + finally: + # Ensure proper shutdown/cleanup of resources + session.shutdown() + cluster.shutdown() -async def _graphvectorstore_from_texts_async( - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - drop: bool = True, - metadata_deny_list: Iterable[str] = [], +@pytest.fixture(scope="function") +def graph_vector_store_angular( table_name: str = "graph_test_table", -) -> CassandraGraphVectorStore: - session = _get_cassandra_session(table_name=table_name, drop=drop) - return await CassandraGraphVectorStore.afrom_texts( - texts=texts, - embedding=embedding, - metadatas=metadatas, - ids=ids, - session=session, - keyspace=TEST_KEYSPACE, - table_name=table_name, - metadata_deny_list=metadata_deny_list, - ) +) -> Generator[CassandraGraphVectorStore, None, None]: + with cassandra_session(table_name=table_name) as session: + yield CassandraGraphVectorStore( + embedding=AngularTwoDimensionalEmbeddings(), + session=session, + keyspace=TEST_KEYSPACE, + table_name=table_name, + ) -def _graphvectorstore_from_documents( - docs: List[Document], - embedding: Embeddings, - ids: Optional[List[str]] = None, - drop: bool = True, - metadata_deny_list: Iterable[str] = [], +@pytest.fixture(scope="function") +def graph_vector_store_earth( table_name: str = "graph_test_table", -) -> CassandraGraphVectorStore: - session = _get_cassandra_session(table_name=table_name, drop=drop) - return CassandraGraphVectorStore.from_documents( - documents=docs, - ids=ids, - embedding=embedding, - session=session, - keyspace=TEST_KEYSPACE, - table_name=table_name, - metadata_deny_list=metadata_deny_list, 
- ) +) -> Generator[CassandraGraphVectorStore, None, None]: + with cassandra_session(table_name=table_name) as session: + yield CassandraGraphVectorStore( + embedding=EarthEmbeddings(), + session=session, + keyspace=TEST_KEYSPACE, + table_name=table_name, + ) -async def _graphvectorstore_from_documents_async( - docs: List[Document], - embedding: Embeddings, - ids: Optional[List[str]] = None, - drop: bool = True, - metadata_deny_list: Iterable[str] = [], +@pytest.fixture(scope="function") +def graph_vector_store_fake( table_name: str = "graph_test_table", -) -> CassandraGraphVectorStore: - session = _get_cassandra_session(table_name=table_name, drop=drop) - return await CassandraGraphVectorStore.afrom_documents( - documents=docs, - ids=ids, - embedding=embedding, - session=session, - keyspace=TEST_KEYSPACE, - table_name=table_name, - metadata_deny_list=metadata_deny_list, - ) +) -> Generator[CassandraGraphVectorStore, None, None]: + with cassandra_session(table_name=table_name) as session: + yield CassandraGraphVectorStore( + embedding=FakeEmbeddings(), + session=session, + keyspace=TEST_KEYSPACE, + table_name=table_name, + ) -def _graph_vector_store_d2( +@pytest.fixture(scope="function") +def graph_vector_store_d2( + embedding_d2: Embeddings, table_name: str = "graph_test_table", -) -> CassandraGraphVectorStore: - session = _get_cassandra_session(table_name=table_name) - return CassandraGraphVectorStore( - embedding=_embedding_d2(), - session=session, - keyspace=TEST_KEYSPACE, - table_name=table_name, - ) +) -> Generator[CassandraGraphVectorStore, None, None]: + with cassandra_session(table_name=table_name) as session: + yield CassandraGraphVectorStore( + embedding=embedding_d2, + session=session, + keyspace=TEST_KEYSPACE, + table_name=table_name, + ) -def _populated_graph_vector_store_d2() -> CassandraGraphVectorStore: - g_store = _graph_vector_store_d2() - g_store.add_documents(_graph_vector_store_docs()) - return g_store +@pytest.fixture(scope="function") +def 
populated_graph_vector_store_d2( + graph_vector_store_d2: CassandraGraphVectorStore, + graph_vector_store_docs: list[Document], +) -> Generator[CassandraGraphVectorStore, None, None]: + graph_vector_store_d2.add_documents(graph_vector_store_docs) + yield graph_vector_store_d2 -def test_mmr_traversal() -> None: - """ - Test end to end construction and MMR search. +def test_mmr_traversal(graph_vector_store_angular: CassandraGraphVectorStore) -> None: + """ Test end to end construction and MMR search. The embedding function used here ensures `texts` become the following vectors on a circle (numbered v0 through v3): @@ -296,11 +268,6 @@ def test_mmr_traversal() -> None: Both v2 and v3 are reachable via edges from v0, so once it is selected, those are both considered. """ - g_store = _graphvectorstore_from_documents( - docs=[], - embedding=AngularTwoDimensionalEmbeddings(), - ) - v0 = Node( id="v0", text="-0.124", @@ -326,6 +293,8 @@ def test_mmr_traversal() -> None: Link.incoming(kind="explicit", tag="link"), ], ) + + g_store = graph_vector_store_angular g_store.add_nodes([v0, v1, v2, v3]) results = g_store.mmr_traversal_search("0.0", k=2, fetch_k=2) @@ -349,7 +318,9 @@ def test_mmr_traversal() -> None: assert _result_ids(results) == ["v0", "v2", "v1", "v3"] -def test_write_retrieve_keywords() -> None: +def test_write_retrieve_keywords( + graph_vector_store_earth: CassandraGraphVectorStore, +) -> None: greetings = Node( id="greetings", text="Typical Greetings", @@ -378,11 +349,7 @@ def test_write_retrieve_keywords() -> None: ], ) - g_store = _graphvectorstore_from_documents( - docs=[], - embedding=EarthEmbeddings(), - ) - + g_store = graph_vector_store_earth g_store.add_nodes(nodes=[greetings, node1, node2]) # Doc2 is more similar, but World and Earth are similar enough that doc1 also @@ -409,12 +376,7 @@ def test_write_retrieve_keywords() -> None: assert set(_result_ids(results)) == {"doc2", "doc1", "greetings"} -def test_metadata() -> None: - g_store = 
_graphvectorstore_from_documents( - docs=[], - embedding=FakeEmbeddings(), - ) - +def test_metadata(graph_vector_store_fake: CassandraGraphVectorStore) -> None: doc_a = Node( id="a", text="A", @@ -425,6 +387,7 @@ def test_metadata() -> None: ], ) + g_store = graph_vector_store_fake g_store.add_nodes([doc_a]) results = g_store.similarity_search("A") assert len(results) == 1 @@ -437,243 +400,272 @@ def test_metadata() -> None: } -def test_gvs_similarity_search_sync() -> None: - """Simple (non-graph) similarity search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - ss_response = g_store.similarity_search(query="[2, 10]", k=2) - ss_labels = [doc.metadata["label"] for doc in ss_response] - assert ss_labels == ["AR", "A0"] - ss_by_v_response = g_store.similarity_search_by_vector(embedding=[2, 10], k=2) - ss_by_v_labels = [doc.metadata["label"] for doc in ss_by_v_response] - assert ss_by_v_labels == ["AR", "A0"] - - -async def test_gvs_similarity_search_async() -> None: - """Simple (non-graph) similarity search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - ss_response = await g_store.asimilarity_search(query="[2, 10]", k=2) - ss_labels = [doc.metadata["label"] for doc in ss_response] - assert ss_labels == ["AR", "A0"] - ss_by_v_response = await g_store.asimilarity_search_by_vector( - embedding=[2, 10], k=2 - ) - ss_by_v_labels = [doc.metadata["label"] for doc in ss_by_v_response] - assert ss_by_v_labels == ["AR", "A0"] - - -def test_gvs_traversal_search_sync() -> None: - """Graph traversal search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - ts_response = g_store.traversal_search(query="[2, 10]", k=2, depth=2) - # this is a set, as some of the internals of trav.search are set-driven - # so ordering is not deterministic: - ts_labels = {doc.metadata["label"] for doc in ts_response} - assert ts_labels == {"AR", "A0", "BR", "B0", "TR", "T0"} - - -async def 
test_gvs_traversal_search_async() -> None: - """Graph traversal search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - ts_labels = set() - async for doc in g_store.atraversal_search(query="[2, 10]", k=2, depth=2): - ts_labels.add(doc.metadata["label"]) - # this is a set, as some of the internals of trav.search are set-driven - # so ordering is not deterministic: - assert ts_labels == {"AR", "A0", "BR", "B0", "TR", "T0"} - - -def test_gvs_mmr_traversal_search_sync() -> None: - """MMR Graph traversal search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - mt_response = g_store.mmr_traversal_search( - query="[2, 10]", - k=2, - depth=2, - fetch_k=1, - adjacent_k=2, - lambda_mult=0.1, - ) - # TODO: can this rightfully be a list (or must it be a set)? - mt_labels = {doc.metadata["label"] for doc in mt_response} - assert mt_labels == {"AR", "BR"} - - -async def test_gvs_mmr_traversal_search_async() -> None: - """MMR Graph traversal search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - mt_labels = set() - async for doc in g_store.ammr_traversal_search( - query="[2, 10]", - k=2, - depth=2, - fetch_k=1, - adjacent_k=2, - lambda_mult=0.1, - ): - mt_labels.add(doc.metadata["label"]) - # TODO: can this rightfully be a list (or must it be a set)? 
- assert mt_labels == {"AR", "BR"} - - -def test_gvs_metadata_search_sync() -> None: - """Metadata search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - mt_response = g_store.metadata_search( - filter={"label": "T0"}, - n=2, - ) - doc: Document = next(iter(mt_response)) - assert doc.page_content == "[-10, 0]" - links = doc.metadata["links"] - assert len(links) == 1 - link: Link = links.pop() - assert isinstance(link, Link) - assert link.direction == "in" - assert link.kind == "at_example" - assert link.tag == "tag_0" - - -async def test_gvs_metadata_search_async() -> None: - """Metadata search on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - mt_response = await g_store.ametadata_search( - filter={"label": "T0"}, - n=2, - ) - doc: Document = next(iter(mt_response)) - assert doc.page_content == "[-10, 0]" - links: set[Link] = doc.metadata["links"] - assert len(links) == 1 - link: Link = links.pop() - assert isinstance(link, Link) - assert link.direction == "in" - assert link.kind == "at_example" - assert link.tag == "tag_0" - - -def test_gvs_get_by_document_id_sync() -> None: - """Get by document_id on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - doc = g_store.get_by_document_id(document_id="FL") - assert doc is not None - assert doc.page_content == "[1, -9]" - links = doc.metadata["links"] - assert len(links) == 1 - link: Link = links.pop() - assert isinstance(link, Link) - assert link.direction == "out" - assert link.kind == "af_example" - assert link.tag == "tag_l" - - invalid_doc = g_store.get_by_document_id(document_id="invalid") - assert invalid_doc is None - - -async def test_gvs_get_by_document_id_async() -> None: - """Get by document_id on a graph vector g_store.""" - g_store = _populated_graph_vector_store_d2() - doc = await g_store.aget_by_document_id(document_id="FL") - assert doc is not None - assert doc.page_content == "[1, -9]" - links = doc.metadata["links"] - 
assert len(links) == 1 - link: Link = links.pop() - assert isinstance(link, Link) - assert link.direction == "out" - assert link.kind == "af_example" - assert link.tag == "tag_l" - - invalid_doc = await g_store.aget_by_document_id(document_id="invalid") - assert invalid_doc is None - - -def test_gvs_from_texts() -> None: - g_store = _graphvectorstore_from_texts( - texts=["[1, 2]"], - embedding=_embedding_d2(), - metadatas=[{"md": 1}], - ids=["x_id"], - ) - - hits = g_store.similarity_search("[2, 1]", k=2) - assert len(hits) == 1 - assert hits[0].page_content == "[1, 2]" - assert hits[0].id == "x_id" - # there may be more re:graph structure. - assert hits[0].metadata["md"] == "1.0" - +class TestCassandraGraphVectorStore: + def test_gvs_similarity_search_sync( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Simple (non-graph) similarity search on a graph vector g_store.""" + g_store = populated_graph_vector_store_d2 + ss_response = g_store.similarity_search(query="[2, 10]", k=2) + ss_labels = [doc.metadata["label"] for doc in ss_response] + assert ss_labels == ["AR", "A0"] + ss_by_v_response = g_store.similarity_search_by_vector(embedding=[2, 10], k=2) + ss_by_v_labels = [doc.metadata["label"] for doc in ss_by_v_response] + assert ss_by_v_labels == ["AR", "A0"] + + async def test_gvs_similarity_search_async( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Simple (non-graph) similarity search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + ss_response = await g_store.asimilarity_search(query="[2, 10]", k=2) + ss_labels = [doc.metadata["label"] for doc in ss_response] + assert ss_labels == ["AR", "A0"] + ss_by_v_response = await g_store.asimilarity_search_by_vector( + embedding=[2, 10], k=2 + ) + ss_by_v_labels = [doc.metadata["label"] for doc in ss_by_v_response] + assert ss_by_v_labels == ["AR", "A0"] + + def test_gvs_traversal_search_sync( + self, + 
populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Graph traversal search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + ts_response = g_store.traversal_search(query="[2, 10]", k=2, depth=2) + # this is a set, as some of the internals of trav.search are set-driven + # so ordering is not deterministic: + ts_labels = {doc.metadata["label"] for doc in ts_response} + assert ts_labels == {"AR", "A0", "BR", "B0", "TR", "T0"} + + async def test_gvs_traversal_search_async( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Graph traversal search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + ts_labels = set() + async for doc in g_store.atraversal_search(query="[2, 10]", k=2, depth=2): + ts_labels.add(doc.metadata["label"]) + # this is a set, as some of the internals of trav.search are set-driven + # so ordering is not deterministic: + assert ts_labels == {"AR", "A0", "BR", "B0", "TR", "T0"} + + def test_gvs_mmr_traversal_search_sync( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """MMR Graph traversal search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + mt_response = g_store.mmr_traversal_search( + query="[2, 10]", + k=2, + depth=2, + fetch_k=1, + adjacent_k=2, + lambda_mult=0.1, + ) + # TODO: can this rightfully be a list (or must it be a set)? 
+ mt_labels = {doc.metadata["label"] for doc in mt_response} + assert mt_labels == {"AR", "BR"} + + async def test_gvs_mmr_traversal_search_async( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """MMR Graph traversal search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + mt_labels = set() + async for doc in g_store.ammr_traversal_search( + query="[2, 10]", + k=2, + depth=2, + fetch_k=1, + adjacent_k=2, + lambda_mult=0.1, + ): + mt_labels.add(doc.metadata["label"]) + # TODO: can this rightfully be a list (or must it be a set)? + assert mt_labels == {"AR", "BR"} + + def test_gvs_metadata_search_sync( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Metadata search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + mt_response = g_store.metadata_search( + filter={"label": "T0"}, + n=2, + ) + doc: Document = next(iter(mt_response)) + assert doc.page_content == "[-10, 0]" + links = doc.metadata["links"] + assert len(links) == 1 + link: Link = links.pop() + assert isinstance(link, Link) + assert link.direction == "in" + assert link.kind == "at_example" + assert link.tag == "tag_0" + + async def test_gvs_metadata_search_async( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Metadata search on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + mt_response = await g_store.ametadata_search( + filter={"label": "T0"}, + n=2, + ) + doc: Document = next(iter(mt_response)) + assert doc.page_content == "[-10, 0]" + links: set[Link] = doc.metadata["links"] + assert len(links) == 1 + link: Link = links.pop() + assert isinstance(link, Link) + assert link.direction == "in" + assert link.kind == "at_example" + assert link.tag == "tag_0" + + def test_gvs_get_by_document_id_sync( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Get by document_id on a graph vector 
store.""" + g_store = populated_graph_vector_store_d2 + doc = g_store.get_by_document_id(document_id="FL") + assert doc is not None + assert doc.page_content == "[1, -9]" + links = doc.metadata["links"] + assert len(links) == 1 + link: Link = links.pop() + assert isinstance(link, Link) + assert link.direction == "out" + assert link.kind == "af_example" + assert link.tag == "tag_l" + + invalid_doc = g_store.get_by_document_id(document_id="invalid") + assert invalid_doc is None + + async def test_gvs_get_by_document_id_async( + self, + populated_graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + """Get by document_id on a graph vector store.""" + g_store = populated_graph_vector_store_d2 + doc = await g_store.aget_by_document_id(document_id="FL") + assert doc is not None + assert doc.page_content == "[1, -9]" + links = doc.metadata["links"] + assert len(links) == 1 + link: Link = links.pop() + assert isinstance(link, Link) + assert link.direction == "out" + assert link.kind == "af_example" + assert link.tag == "tag_l" + + invalid_doc = await g_store.aget_by_document_id(document_id="invalid") + assert invalid_doc is None + + def test_gvs_from_texts( + self, + graph_vector_store_d2: CassandraGraphVectorStore, + ) -> None: + g_store = graph_vector_store_d2 + g_store.add_texts( + texts=["[1, 2]"], + metadatas=[{"md": 1}], + ids=["x_id"], + ) -def test_gvs_from_documents_containing_ids() -> None: - the_document = Document( - page_content="[1, 2]", - metadata={"md": 1}, - id="x_id", - ) - g_store = _graphvectorstore_from_documents( - docs=[the_document], - embedding=_embedding_d2(), - ) - hits = g_store.similarity_search("[2, 1]", k=2) - assert len(hits) == 1 - assert hits[0].page_content == "[1, 2]" - assert hits[0].id == "x_id" - # there may be more re:graph structure. 
def test_gvs_from_documents_containing_ids(
    self,
    graph_vector_store_d2: CassandraGraphVectorStore,
) -> None:
    """Check that a document added with an explicit id is found under that id.

    A single document is inserted and then searched for with ``k=2``; the
    test expects exactly one hit carrying the original content and id.
    """
    graph_vector_store_d2.add_documents(
        [
            Document(
                page_content="[1, 2]",
                metadata={"md": 1},
                id="x_id",
            )
        ]
    )
    found = graph_vector_store_d2.similarity_search("[2, 1]", k=2)
    assert len(found) == 1
    only_hit = found[0]
    assert only_hit.page_content == "[1, 2]"
    assert only_hit.id == "x_id"
    # Only the "md" key is checked: the graph layer may store additional
    # metadata entries of its own next to it.
    assert only_hit.metadata["md"] == "1.0"


def test_gvs_add_nodes_sync(
    self,
    *,
    graph_vector_store_d2: CassandraGraphVectorStore,
) -> None:
    """Add two linked nodes synchronously and read them back by vector search.

    The first node has an explicit id, the second does not. For each hit
    the test checks id, text, the stringified ``m`` metadata value and that
    at least one other metadata value is a ``set``.
    """
    store = graph_vector_store_d2
    node_with_id = Node(
        id="id0",
        text="[0, 2]",
        metadata={"m": 0},
        links=[
            Link(kind="kA", direction="out", tag="tA"),
            Link(kind="kB", direction="bidir", tag="tB"),
        ],
    )
    node_without_id = Node(
        text="[0, 1]",
        metadata={"m": 1},
        links=[Link(kind="kC", direction="in", tag="tC")],
    )
    store.add_nodes([node_with_id, node_without_id])

    found = store.similarity_search_by_vector([0, 3])
    assert len(found) == 2
    first, second = found

    assert first.id == "id0"
    assert first.page_content == "[0, 2]"
    first_md = first.metadata
    assert first_md["m"] == "0.0"
    assert any(
        key != "m" and isinstance(value, set) for key, value in first_md.items()
    )

    # The second node was stored without an id, so only "not id0" is known.
    assert second.id != "id0"
    assert second.page_content == "[0, 1]"
    second_md = second.metadata
    assert second_md["m"] == "1.0"
    assert any(
        key != "m" and isinstance(value, set) for key, value in second_md.items()
    )


async def test_gvs_add_nodes_async(
    self,
    *,
    graph_vector_store_d2: CassandraGraphVectorStore,
) -> None:
    """Async counterpart of the add-nodes test.

    Nodes are written by draining the ``aadd_nodes`` async iterator and read
    back with ``asimilarity_search_by_vector``; the checks on the two hits
    are the same as in the synchronous test.
    """
    store = graph_vector_store_d2
    node_with_id = Node(
        id="id0",
        text="[0, 2]",
        metadata={"m": 0},
        links=[
            Link(kind="kA", direction="out", tag="tA"),
            Link(kind="kB", direction="bidir", tag="tB"),
        ],
    )
    node_without_id = Node(
        text="[0, 1]",
        metadata={"m": 1},
        links=[Link(kind="kC", direction="in", tag="tC")],
    )
    # aadd_nodes is an async iterator; it has to be consumed for the
    # writes to happen, the yielded values themselves are not needed.
    async for _ in store.aadd_nodes([node_with_id, node_without_id]):
        pass

    found = await store.asimilarity_search_by_vector([0, 3])
    assert len(found) == 2
    first, second = found

    assert first.id == "id0"
    assert first.page_content == "[0, 2]"
    first_md = first.metadata
    assert first_md["m"] == "0.0"
    assert any(
        key != "m" and isinstance(value, set) for key, value in first_md.items()
    )

    assert second.id != "id0"
    assert second.page_content == "[0, 1]"
    second_md = second.metadata
    assert second_md["m"] == "1.0"
    assert any(
        key != "m" and isinstance(value, set) for key, value in second_md.items()
    )
@contextmanager
def cassandra_session(table_name: str, drop: bool = True) -> Generator[Any, None, None]:
    """Yield a Cassandra driver session with the test keyspace in place.

    Contact points are read from the ``CASSANDRA_CONTACT_POINTS`` environment
    variable when it is set; otherwise ``None`` is passed to ``Cluster``.

    Args:
        table_name: Table (inside ``TEST_KEYSPACE``) the caller will use.
        drop: When true, drop that table before yielding so the caller
            starts from a clean state.

    Yields:
        The connected session. Session and cluster are shut down when the
        ``with`` block exits, whether or not it raised.
    """
    from cassandra.cluster import Cluster

    # NOTE(review): the patch elides two lines inside the original list
    # comprehension; splitting the variable on commas and skipping blank
    # entries is assumed here -- confirm against the full file.
    raw_points = os.environ.get("CASSANDRA_CONTACT_POINTS")
    if raw_points is None:
        contact_points = None
    else:
        contact_points = [cp.strip() for cp in raw_points.split(",") if cp.strip()]

    cluster = Cluster(contact_points)
    try:
        # connect() is inside the try so a failed connection still releases
        # the cluster's resources.
        session = cluster.connect()
        try:
            # Only the first fragment is an f-string. Brace escaping
            # ("{{" -> "{") applies to f-strings only, so the replication
            # map in the plain-string fragment uses single braces.
            session.execute(
                f"CREATE KEYSPACE IF NOT EXISTS {TEST_KEYSPACE}"
                " WITH replication = "
                "{'class': 'SimpleStrategy', 'replication_factor': 1}"
            )

            if drop:
                session.execute(f"DROP TABLE IF EXISTS {TEST_KEYSPACE}.{table_name}")

            yield session
        finally:
            session.shutdown()
    finally:
        cluster.shutdown()


@contextmanager
def vector_store(
    table_name: str,
    setup_mode: SetupMode,
    metadata_indexing: Union[Tuple[str, Iterable[str]], str] = "all",
    drop: bool = True,
) -> Generator[Cassandra, None, None]:
    """Yield a ``Cassandra`` vector store bound to a managed session.

    Args:
        table_name: Table to create or reuse inside ``TEST_KEYSPACE``.
        setup_mode: Setup mode forwarded to the store.
        metadata_indexing: Indexing policy forwarded to the store.
        drop: Forwarded to ``cassandra_session``; drops the table first.

    Yields:
        The store. The underlying session is closed by ``cassandra_session``
        when the ``with`` block exits.
    """
    with cassandra_session(table_name=table_name, drop=drop) as session:
        yield Cassandra(
            table_name=table_name,
            keyspace=TEST_KEYSPACE,
            session=session,
            embedding=_embedding_d2(),
            setup_mode=setup_mode,
            metadata_indexing=metadata_indexing,
        )


@contextmanager
def graph_vector_store(
    table_name: str,
    setup_mode: SetupMode,
    metadata_deny_list: Optional[list[str]] = None,
    drop: bool = True,
) -> Generator[CassandraGraphVectorStore, None, None]:
    """Yield a ``CassandraGraphVectorStore`` bound to a managed session.

    Args:
        table_name: Table to create or reuse inside ``TEST_KEYSPACE``.
        setup_mode: Setup mode forwarded to the store.
        metadata_deny_list: Metadata keys to exclude from indexing, or
            ``None`` to pass no list.
        drop: Forwarded to ``cassandra_session``; drops the table first.

    Yields:
        The graph store. The underlying session is closed by
        ``cassandra_session`` when the ``with`` block exits.
    """
    # The store constructor appends its own key to the list it receives, so
    # hand it a copy and leave the caller's (parametrized) list untouched.
    deny_list = None if metadata_deny_list is None else list(metadata_deny_list)
    with cassandra_session(table_name=table_name, drop=drop) as session:
        yield CassandraGraphVectorStore(
            table_name=table_name,
            keyspace=TEST_KEYSPACE,
            session=session,
            embedding=_embedding_d2(),
            setup_mode=setup_mode,
            metadata_deny_list=deny_list,
        )


def _vs_indexing_policy(table_name: str) -> Union[Tuple[str, Iterable[str]], str]:
    """Return the metadata indexing policy used to create ``table_name``.

    Args:
        table_name: One of the ``TABLE_NAME_*`` constants of this module.

    Returns:
        ``"all"`` for the default table, otherwise a ``(mode, fields)``
        tuple for the allow-list or deny-list table.

    Raises:
        ValueError: If ``table_name`` is not one of the known tables.
    """
    if table_name == TABLE_NAME_ALLOW_INDEXING:
        return ("allowlist", ["test"])
    if table_name == TABLE_NAME_DEFAULT:
        return "all"
    if table_name == TABLE_NAME_DENY_INDEXING:
        return ("denylist", ["test"])
    msg = f"Unknown table_name: {table_name} in _vs_indexing_policy()"
    raise ValueError(msg)
class TestUpgradeToGraphVectorStore:
    """Open a table written by the plain vector store as a graph vector store.

    Each test writes one document through ``vector_store``, closes that
    session, then reopens the same table (``drop=False``) through
    ``graph_vector_store`` and checks the document can still be read.
    """

    @pytest.mark.parametrize(
        ("table_name", "gvs_setup_mode", "gvs_metadata_deny_list"),
        [
            (TABLE_NAME_DEFAULT, SetupMode.SYNC, None),
            (TABLE_NAME_DENY_INDEXING, SetupMode.SYNC, ["test"]),
            (TABLE_NAME_DEFAULT, SetupMode.OFF, None),
            (TABLE_NAME_DENY_INDEXING, SetupMode.OFF, ["test"]),
            # For this one, even though the passed policy doesn't match the
            # policy used to create the collection, there is no error since
            # the SetupMode is OFF and no attempt is made to re-create the
            # collection.
            (TABLE_NAME_DENY_INDEXING, SetupMode.OFF, None),
        ],
        ids=[
            "default_upgrade_no_policy_sync",
            "deny_list_upgrade_same_policy_sync",
            "default_upgrade_no_policy_off",
            "deny_list_upgrade_same_policy_off",
            "deny_list_upgrade_change_policy_off",
        ],
    )
    def test_upgrade_to_gvs_success_sync(
        self,
        *,
        gvs_setup_mode: SetupMode,
        table_name: str,
        gvs_metadata_deny_list: Optional[list[str]],
    ) -> None:
        """Upgrade succeeds with the synchronous API.

        Args:
            gvs_setup_mode: Setup mode used when opening the graph store.
            table_name: Table shared by both stores; also selects the
                indexing policy of the initial vector store.
            gvs_metadata_deny_list: Deny list for the graph store, or
                ``None`` when no list is passed.
        """
        doc_id = "AL"
        doc_al = Document(id=doc_id, page_content="[-1, 9]", metadata={"label": "AL"})

        # Create the plain vector store (always SetupMode.SYNC) on a freshly
        # dropped table and store one document in it.
        with vector_store(
            table_name=table_name,
            setup_mode=SetupMode.SYNC,
            metadata_indexing=_vs_indexing_policy(table_name=table_name),
            drop=True,
        ) as v_store:
            v_store.add_documents([doc_al])

            # Read the document back through the vector store.
            v_doc = v_store.get_by_document_id(document_id=doc_id)
            assert v_doc is not None
            assert v_doc.page_content == doc_al.page_content

        # Open a graph vector store on the existing table (drop=False) with
        # setup_mode=gvs_setup_mode and metadata_deny_list=gvs_metadata_deny_list.
        with graph_vector_store(
            table_name=table_name,
            setup_mode=gvs_setup_mode,
            metadata_deny_list=gvs_metadata_deny_list,
            drop=False,
        ) as gv_store:
            # Read the same document through the graph vector store.
            gv_doc = gv_store.get_by_document_id(document_id=doc_id)
            assert gv_doc is not None
            assert gv_doc.page_content == doc_al.page_content

    @pytest.mark.parametrize(
        ("table_name", "gvs_setup_mode", "gvs_metadata_deny_list"),
        [
            (TABLE_NAME_DEFAULT, SetupMode.ASYNC, None),
            (TABLE_NAME_DENY_INDEXING, SetupMode.ASYNC, ["test"]),
        ],
        ids=[
            "default_upgrade_no_policy_async",
            "deny_list_upgrade_same_policy_async",
        ],
    )
    async def test_upgrade_to_gvs_success_async(
        self,
        *,
        gvs_setup_mode: SetupMode,
        table_name: str,
        gvs_metadata_deny_list: Optional[list[str]],
    ) -> None:
        """Upgrade succeeds with the asynchronous API.

        Args:
            gvs_setup_mode: Setup mode used when opening the graph store.
            table_name: Table shared by both stores; also selects the
                indexing policy of the initial vector store.
            gvs_metadata_deny_list: Deny list for the graph store, or
                ``None`` when no list is passed.
        """
        doc_id = "AL"
        doc_al = Document(id=doc_id, page_content="[-1, 9]", metadata={"label": "AL"})

        # Create the plain vector store with SetupMode.ASYNC on a freshly
        # dropped table and store one document in it.
        with vector_store(
            table_name=table_name,
            setup_mode=SetupMode.ASYNC,
            metadata_indexing=_vs_indexing_policy(table_name=table_name),
            drop=True,
        ) as v_store:
            await v_store.aadd_documents([doc_al])

            # Read the document back through the vector store.
            v_doc = await v_store.aget_by_document_id(document_id=doc_id)
            assert v_doc is not None
            assert v_doc.page_content == doc_al.page_content

        # Open a graph vector store on the existing table (drop=False) with
        # setup_mode=gvs_setup_mode and metadata_deny_list=gvs_metadata_deny_list.
        with graph_vector_store(
            table_name=table_name,
            setup_mode=gvs_setup_mode,
            metadata_deny_list=gvs_metadata_deny_list,
            drop=False,
        ) as gv_store:
            # Read the same document through the graph vector store.
            gv_doc = await gv_store.aget_by_document_id(document_id=doc_id)
            assert gv_doc is not None
            assert gv_doc.page_content == doc_al.page_content
@pytest.mark.parametrize(
    "page_contents",
    [
        [
            "[1,2]",
            "[3,4]",
            "[5,6]",
            "[7,8]",
            "[9,10]",
            "[11,12]",
        ],
    ],
)
def test_cassandra_vectorstore_from_texts_sync(
    self,
    *,
    embedding_d2: Embeddings,
    page_contents: list[str],
) -> None:
    """Build a store with ``from_texts`` three times against the same table.

    The first call creates the table; the next two are made with
    ``drop=False`` so they add to it. After each call the newest text is
    searched for and its content, metadata and id are checked. The second
    search goes through the first store object, the third through the
    store returned by the last call.
    """
    v_store = _vectorstore_from_texts(
        texts=page_contents[0:2],
        metadatas=[{"m": 1}, {"m": 3}],
        embedding=embedding_d2,
        ids=["ft1", "ft3"],
    )
    search_results_triples_0 = v_store.similarity_search_with_score_id(
        page_contents[1],
        k=1,
    )
    assert len(search_results_triples_0) == 1
    res_doc_0, _, res_id_0 = search_results_triples_0[0]
    assert res_doc_0.page_content == page_contents[1]
    assert res_doc_0.metadata == {"m": "3.0"}
    assert res_id_0 == "ft3"

    # Second batch: the returned store is discarded on purpose, the rows
    # must be visible through the store created above.
    _vectorstore_from_texts(
        texts=page_contents[2:4],
        metadatas=[{"m": 5}, {"m": 7}],
        embedding=embedding_d2,
        ids=["ft5", "ft7"],
        drop=False,
    )
    search_results_triples_1 = v_store.similarity_search_with_score_id(
        page_contents[3],
        k=1,
    )
    assert len(search_results_triples_1) == 1
    res_doc_1, _, res_id_1 = search_results_triples_1[0]
    assert res_doc_1.page_content == page_contents[3]
    assert res_doc_1.metadata == {"m": "7.0"}
    assert res_id_1 == "ft7"

    # Third batch: this time query through the newly returned store.
    v_store_2 = _vectorstore_from_texts(
        texts=page_contents[4:6],
        metadatas=[{"m": 9}, {"m": 11}],
        embedding=embedding_d2,
        ids=["ft9", "ft11"],
        drop=False,
    )
    search_results_triples_2 = v_store_2.similarity_search_with_score_id(
        page_contents[5],
        k=1,
    )
    assert len(search_results_triples_2) == 1
    res_doc_2, _, res_id_2 = search_results_triples_2[0]
    assert res_doc_2.page_content == page_contents[5]
    assert res_doc_2.metadata == {"m": "11.0"}
    assert res_id_2 == "ft11"


@pytest.mark.parametrize(
    "page_contents",
    [
        ["[1,2]", "[3,4]"],
    ],
)
def test_cassandra_vectorstore_from_documents_sync(
    self,
    *,
    embedding_d2: Embeddings,
    page_contents: list[str],
) -> None:
    """from_documents, esp. the various handling of ID-in-doc vs external.

    Four variants are exercised in turn, clearing the store after each:
    no ids, ids passed separately, ids set on the documents, and both at
    once. Passing ``ids`` separately is expected to raise exactly one
    ``DeprecationWarning``; when both are given, the separately passed id
    is the one found on the hit.
    """
    pc1, pc2 = page_contents

    # no IDs.
    v_store = _vectorstore_from_documents(
        [
            Document(page_content=pc1, metadata={"m": 1}),
            Document(page_content=pc2, metadata={"m": 3}),
        ],
        embedding=embedding_d2,
    )
    hits = v_store.similarity_search(pc2, k=1)
    assert len(hits) == 1
    assert hits[0].page_content == pc2
    assert hits[0].metadata == {"m": "3.0"}
    v_store.clear()

    # IDs passed separately.
    with pytest.warns(DeprecationWarning) as rec_warnings:
        v_store_2 = _vectorstore_from_documents(
            [
                Document(page_content=pc1, metadata={"m": 1}),
                Document(page_content=pc2, metadata={"m": 3}),
            ],
            ids=["idx1", "idx3"],
            embedding=embedding_d2,
            drop=False,
        )
    f_rec_warnings = [
        wrn for wrn in rec_warnings if issubclass(wrn.category, DeprecationWarning)
    ]
    assert len(f_rec_warnings) == 1
    hits = v_store_2.similarity_search(pc2, k=1)
    assert len(hits) == 1
    assert hits[0].page_content == pc2
    assert hits[0].metadata == {"m": "3.0"}
    assert hits[0].id == "idx3"
    v_store_2.clear()

    # IDs in documents.
    v_store_3 = _vectorstore_from_documents(
        [
            Document(page_content=pc1, metadata={"m": 1}, id="idx1"),
            Document(page_content=pc2, metadata={"m": 3}, id="idx3"),
        ],
        embedding=embedding_d2,
        drop=False,
    )
    hits = v_store_3.similarity_search(pc2, k=1)
    assert len(hits) == 1
    assert hits[0].page_content == pc2
    assert hits[0].metadata == {"m": "3.0"}
    assert hits[0].id == "idx3"
    v_store_3.clear()

    # IDs both in documents and aside.
    with pytest.warns(DeprecationWarning) as rec_warnings:
        v_store_4 = _vectorstore_from_documents(
            [
                Document(page_content=pc1, metadata={"m": 1}),
                Document(page_content=pc2, metadata={"m": 3}, id="idy3"),
            ],
            embedding=embedding_d2,
            ids=["idx1", "idx3"],
            drop=False,
        )
    f_rec_warnings = [
        wrn for wrn in rec_warnings if issubclass(wrn.category, DeprecationWarning)
    ]
    # Same check as in the "IDs passed separately" case above; without it
    # the filtered list was computed and then ignored.
    assert len(f_rec_warnings) == 1
    hits = v_store_4.similarity_search(pc2, k=1)
    assert len(hits) == 1
    assert hits[0].page_content == pc2
    assert hits[0].metadata == {"m": "3.0"}
    # The separately passed id ("idx3") is the one stored, not "idy3".
    assert hits[0].id == "idx3"
    v_store_4.clear()
{"m": 3}], + ids=["ft1", "ft3"], + embedding=embedding_d2, + ) + search_results_triples_0 = await v_store.asimilarity_search_with_score_id( + page_contents[1], + k=1, + ) + assert len(search_results_triples_0) == 1 + res_doc_0, _, res_id_0 = search_results_triples_0[0] + assert res_doc_0.page_content == page_contents[1] + assert res_doc_0.metadata == {"m": "3.0"} + assert res_id_0 == "ft3" + + await _vectorstore_from_texts_async( + texts=page_contents[2:4], + metadatas=[{"m": 5}, {"m": 7}], + ids=["ft5", "ft7"], + embedding=embedding_d2, + drop=False, + ) + search_results_triples_1 = await v_store.asimilarity_search_with_score_id( + page_contents[3], + k=1, + ) + assert len(search_results_triples_1) == 1 + res_doc_1, _, res_id_1 = search_results_triples_1[0] + assert res_doc_1.page_content == page_contents[3] + assert res_doc_1.metadata == {"m": "7.0"} + assert res_id_1 == "ft7" + + v_store_2 = await _vectorstore_from_texts_async( + texts=page_contents[4:6], + metadatas=[{"m": 9}, {"m": 11}], + ids=["ft9", "ft11"], + embedding=embedding_d2, + drop=False, + ) + search_results_triples_2 = await v_store_2.asimilarity_search_with_score_id( + page_contents[5], + k=1, + ) + assert len(search_results_triples_2) == 1 + res_doc_2, _, res_id_2 = search_results_triples_2[0] + assert res_doc_2.page_content == page_contents[5] + assert res_doc_2.metadata == {"m": "11.0"} + assert res_id_2 == "ft11" + + @pytest.mark.parametrize( + "page_contents", [ - Document(page_content="[9,10]", metadata={"k": "v", "ord": 204}), - Document(page_content="[11,12]", metadata={"k": "w", "ord": 205}), + ["[1,2]", "[3,4]"], ], - ids=["v", "w"], - ) - assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 2 - res4 = await vstore.asimilarity_search("[11,12]", k=1, filter={"k": "w"}) - assert res4[0].metadata["ord"] == "205.0" - assert res4[0].id == "w" - # add_texts with "ids" arg passthrough - await vstore.aadd_texts( - texts=["[13,14]", "[15,16]"], - metadatas=[{"k": "r", "ord": 306}, 
{"k": "s", "ord": 307}], - ids=["r", "s"], - ) - assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 4 - res4 = await vstore.asimilarity_search("[-1,-1]", k=1, filter={"k": "s"}) - assert res4[0].metadata["ord"] == "307.0" - assert res4[0].id == "s" - # delete_by_document_id - await vstore.adelete_by_document_id("s") - assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 3 - - -def test_cassandra_vectorstore_massive_insert_replace_sync() -> None: - """Testing the insert-many-and-replace-some patterns thoroughly.""" - vector_store_d2 = _vectorstore_from_documents( - docs=[], - embedding=_embedding_d2(), - ) - - full_size = 300 - first_group_size = 150 - second_group_slicer = [30, 100, 2] - - all_ids = [f"doc_{idx}" for idx in range(full_size)] - all_texts = [f"[0,{idx + 1}]" for idx in range(full_size)] - - # massive insertion on empty - group0_ids = all_ids[0:first_group_size] - group0_texts = all_texts[0:first_group_size] - inserted_ids0 = vector_store_d2.add_texts( - texts=group0_texts, - ids=group0_ids, - ) - assert set(inserted_ids0) == set(group0_ids) - # massive insertion with many overwrites scattered through - # (we change the text to later check on DB for successful update) - _s, _e, _st = second_group_slicer - group1_ids = all_ids[_s:_e:_st] + all_ids[first_group_size:full_size] - group1_texts = [ - txt.upper() - for txt in (all_texts[_s:_e:_st] + all_texts[first_group_size:full_size]) - ] - inserted_ids1 = vector_store_d2.add_texts( - texts=group1_texts, - ids=group1_ids, - ) - assert set(inserted_ids1) == set(group1_ids) - # final read (we want the IDs to do a full check) - expected_text_by_id = { - **dict(zip(group0_ids, group0_texts)), - **dict(zip(group1_ids, group1_texts)), - } - full_results = vector_store_d2.similarity_search_with_score_id_by_vector( - embedding=[1.0, 1.0], - k=full_size, - ) - for doc, _, doc_id in full_results: - assert doc.page_content == expected_text_by_id[doc_id] - - -async def 
test_cassandra_vectorstore_massive_insert_replace_async() -> None: - """ - Testing the insert-many-and-replace-some patterns thoroughly. - Async version. - """ - vector_store_d2 = await _vectorstore_from_documents_async( - docs=[], - embedding=_embedding_d2(), ) - - full_size = 300 - first_group_size = 150 - second_group_slicer = [30, 100, 2] - - all_ids = [f"doc_{idx}" for idx in range(full_size)] - all_texts = [f"[0,{idx + 1}]" for idx in range(full_size)] - all_embeddings = [[0, idx + 1] for idx in range(full_size)] - - # massive insertion on empty - group0_ids = all_ids[0:first_group_size] - group0_texts = all_texts[0:first_group_size] - - inserted_ids0 = await vector_store_d2.aadd_texts( - texts=group0_texts, - ids=group0_ids, - ) - assert set(inserted_ids0) == set(group0_ids) - # massive insertion with many overwrites scattered through - # (we change the text to later check on DB for successful update) - _s, _e, _st = second_group_slicer - group1_ids = all_ids[_s:_e:_st] + all_ids[first_group_size:full_size] - group1_texts = [ - txt.upper() - for txt in (all_texts[_s:_e:_st] + all_texts[first_group_size:full_size]) - ] - inserted_ids1 = await vector_store_d2.aadd_texts( - texts=group1_texts, - ids=group1_ids, - ) - assert set(inserted_ids1) == set(group1_ids) - # final read (we want the IDs to do a full check) - expected_text_by_id = dict(zip(all_ids, all_texts)) - full_results = await vector_store_d2.asimilarity_search_with_score_id_by_vector( - embedding=[1.0, 1.0], - k=full_size, - ) - for doc, _, doc_id in full_results: - assert doc.page_content == expected_text_by_id[doc_id] - expected_embedding_by_id = dict(zip(all_ids, all_embeddings)) - full_results_with_embeddings = ( - await vector_store_d2.asimilarity_search_with_embedding_id_by_vector( + async def test_cassandra_vectorstore_from_documents_async( + self, + *, + embedding_d2: Embeddings, + page_contents: list[str], + ) -> None: + """ + from_documents, esp. 
the various handling of ID-in-doc vs external. + Async version. + """ + pc1, pc2 = page_contents + + # no IDs. + v_store_1 = await _vectorstore_from_documents_async( + [ + Document(page_content=pc1, metadata={"m": 1}), + Document(page_content=pc2, metadata={"m": 3}), + ], + embedding=embedding_d2, + ) + hits = await v_store_1.asimilarity_search(pc2, k=1) + assert len(hits) == 1 + assert hits[0].page_content == pc2 + assert hits[0].metadata == {"m": "3.0"} + await v_store_1.aclear() + + # IDs passed separately. + with pytest.warns(DeprecationWarning) as rec_warnings: + v_store_2 = await _vectorstore_from_documents_async( + [ + Document(page_content=pc1, metadata={"m": 1}), + Document(page_content=pc2, metadata={"m": 3}), + ], + embedding=embedding_d2, + drop=False, + ids=["idx1", "idx3"], + ) + f_rec_warnings = [ + wrn for wrn in rec_warnings if issubclass(wrn.category, DeprecationWarning) + ] + assert len(f_rec_warnings) == 1 + hits = await v_store_2.asimilarity_search(pc2, k=1) + assert len(hits) == 1 + assert hits[0].page_content == pc2 + assert hits[0].metadata == {"m": "3.0"} + assert hits[0].id == "idx3" + await v_store_2.aclear() + + # IDs in documents. + + v_store_3 = await _vectorstore_from_documents_async( + [ + Document(page_content=pc1, metadata={"m": 1}, id="idx1"), + Document(page_content=pc2, metadata={"m": 3}, id="idx3"), + ], + embedding=embedding_d2, + drop=False, + ) + hits = await v_store_3.asimilarity_search(pc2, k=1) + assert len(hits) == 1 + assert hits[0].page_content == pc2 + assert hits[0].metadata == {"m": "3.0"} + assert hits[0].id == "idx3" + await v_store_3.aclear() + + # IDs both in documents and aside. 
+ with pytest.warns(DeprecationWarning) as rec_warnings: + v_store_4 = await _vectorstore_from_documents_async( + [ + Document(page_content=pc1, metadata={"m": 1}), + Document(page_content=pc2, metadata={"m": 3}, id="idy3"), + ], + embedding=embedding_d2, + ids=["idx1", "idx3"], + drop=False, + ) + f_rec_warnings = [ + wrn for wrn in rec_warnings if issubclass(wrn.category, DeprecationWarning) + ] + assert len(f_rec_warnings) == 1 + hits = await v_store_4.asimilarity_search(pc2, k=1) + assert len(hits) == 1 + assert hits[0].page_content == pc2 + assert hits[0].metadata == {"m": "3.0"} + assert hits[0].id == "idx3" + await v_store_4.aclear() + + def test_cassandra_vectorstore_crud_sync( + self, + vector_store_d2: Cassandra, + ) -> None: + """Add/delete/update behaviour.""" + vstore = vector_store_d2 + + res0 = vstore.similarity_search("[-1,-1]", k=2) + assert res0 == [] + # write and check again + added_ids = vstore.add_texts( + texts=["[1,2]", "[3,4]", "[5,6]"], + metadatas=[ + {"k": "a", "ord": 0}, + {"k": "b", "ord": 1}, + {"k": "c", "ord": 2}, + ], + ids=["a", "b", "c"], + ) + # not requiring ordered match (elsewhere it may be overwriting some) + assert set(added_ids) == {"a", "b", "c"} + res1 = vstore.similarity_search("[-1,-1]", k=5) + assert {doc.page_content for doc in res1} == {"[1,2]", "[3,4]", "[5,6]"} + res2 = vstore.similarity_search("[3,4]", k=1) + assert len(res2) == 1 + assert res2[0].page_content == "[3,4]" + assert res2[0].metadata == {"k": "b", "ord": "1.0"} + assert res2[0].id == "b" + # partial overwrite and count total entries + added_ids_1 = vstore.add_texts( + texts=["[5,6]", "[7,8]"], + metadatas=[ + {"k": "c_new", "ord": 102}, + {"k": "d_new", "ord": 103}, + ], + ids=["c", "d"], + ) + # not requiring ordered match (elsewhere it may be overwriting some) + assert set(added_ids_1) == {"c", "d"} + res2 = vstore.similarity_search("[-1,-1]", k=10) + assert len(res2) == 4 + # pick one that was just updated and check its metadata + res3 = 
vstore.similarity_search_with_score_id( + query="[5,6]", k=1, filter={"k": "c_new"} + ) + doc3, _, id3 = res3[0] + assert doc3.page_content == "[5,6]" + assert doc3.metadata == {"k": "c_new", "ord": "102.0"} + assert id3 == "c" + # delete and count again + del1_res = vstore.delete(["b"]) + assert del1_res is True + del2_res = vstore.delete(["a", "c", "Z!"]) + assert del2_res is True # a non-existing ID was supplied + assert len(vstore.similarity_search("[-1,-1]", k=10)) == 1 + # clear store + vstore.clear() + assert vstore.similarity_search("[-1,-1]", k=2) == [] + # add_documents with "ids" arg passthrough + vstore.add_documents( + [ + Document(page_content="[9,10]", metadata={"k": "v", "ord": 204}), + Document(page_content="[11,12]", metadata={"k": "w", "ord": 205}), + ], + ids=["v", "w"], + ) + assert len(vstore.similarity_search("[-1,-1]", k=10)) == 2 + res4 = vstore.similarity_search("[11,12]", k=1, filter={"k": "w"}) + assert res4[0].metadata["ord"] == "205.0" + assert res4[0].id == "w" + # add_texts with "ids" arg passthrough + vstore.add_texts( + texts=["[13,14]", "[15,16]"], + metadatas=[{"k": "r", "ord": 306}, {"k": "s", "ord": 307}], + ids=["r", "s"], + ) + assert len(vstore.similarity_search("[-1,-1]", k=10)) == 4 + res4 = vstore.similarity_search("[-1,-1]", k=1, filter={"k": "s"}) + assert res4[0].metadata["ord"] == "307.0" + assert res4[0].id == "s" + # delete_by_document_id + vstore.delete_by_document_id("s") + assert len(vstore.similarity_search("[-1,-1]", k=10)) == 3 + + async def test_cassandra_vectorstore_crud_async( + self, + vector_store_d2: Cassandra, + ) -> None: + """Add/delete/update behaviour, async version.""" + vstore = vector_store_d2 + + res0 = await vstore.asimilarity_search("[-1,-1]", k=2) + assert res0 == [] + # write and check again + added_ids = await vstore.aadd_texts( + texts=["[1,2]", "[3,4]", "[5,6]"], + metadatas=[ + {"k": "a", "ord": 0}, + {"k": "b", "ord": 1}, + {"k": "c", "ord": 2}, + ], + ids=["a", "b", "c"], + ) + # not 
requiring ordered match (elsewhere it may be overwriting some) + assert set(added_ids) == {"a", "b", "c"} + res1 = await vstore.asimilarity_search("[-1,-1]", k=5) + assert {doc.page_content for doc in res1} == {"[1,2]", "[3,4]", "[5,6]"} + res2 = await vstore.asimilarity_search("[3,4]", k=1) + assert len(res2) == 1 + assert res2[0].page_content == "[3,4]" + assert res2[0].metadata == {"k": "b", "ord": "1.0"} + assert res2[0].id == "b" + # partial overwrite and count total entries + added_ids_1 = await vstore.aadd_texts( + texts=["[5,6]", "[7,8]"], + metadatas=[ + {"k": "c_new", "ord": 102}, + {"k": "d_new", "ord": 103}, + ], + ids=["c", "d"], + ) + # not requiring ordered match (elsewhere it may be overwriting some) + assert set(added_ids_1) == {"c", "d"} + res2 = await vstore.asimilarity_search("[-1,-1]", k=10) + assert len(res2) == 4 + # pick one that was just updated and check its metadata + res3 = await vstore.asimilarity_search_with_score_id( + query="[5,6]", k=1, filter={"k": "c_new"} + ) + doc3, _, id3 = res3[0] + assert doc3.page_content == "[5,6]" + assert doc3.metadata == {"k": "c_new", "ord": "102.0"} + assert id3 == "c" + # delete and count again + del1_res = await vstore.adelete(["b"]) + assert del1_res is True + del2_res = await vstore.adelete(["a", "c", "Z!"]) + assert del2_res is True # a non-existing ID was supplied + assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 1 + # clear store + await vstore.aclear() + assert await vstore.asimilarity_search("[-1,-1]", k=2) == [] + # add_documents with "ids" arg passthrough + await vstore.aadd_documents( + [ + Document(page_content="[9,10]", metadata={"k": "v", "ord": 204}), + Document(page_content="[11,12]", metadata={"k": "w", "ord": 205}), + ], + ids=["v", "w"], + ) + assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 2 + res4 = await vstore.asimilarity_search("[11,12]", k=1, filter={"k": "w"}) + assert res4[0].metadata["ord"] == "205.0" + assert res4[0].id == "w" + # add_texts 
with "ids" arg passthrough + await vstore.aadd_texts( + texts=["[13,14]", "[15,16]"], + metadatas=[{"k": "r", "ord": 306}, {"k": "s", "ord": 307}], + ids=["r", "s"], + ) + assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 4 + res4 = await vstore.asimilarity_search("[-1,-1]", k=1, filter={"k": "s"}) + assert res4[0].metadata["ord"] == "307.0" + assert res4[0].id == "s" + # delete_by_document_id + await vstore.adelete_by_document_id("s") + assert len(await vstore.asimilarity_search("[-1,-1]", k=10)) == 3 + + def test_cassandra_vectorstore_massive_insert_replace_sync( + self, + vector_store_d2: Cassandra, + ) -> None: + """Testing the insert-many-and-replace-some patterns thoroughly.""" + full_size = 300 + first_group_size = 150 + second_group_slicer = [30, 100, 2] + + all_ids = [f"doc_{idx}" for idx in range(full_size)] + all_texts = [f"[0,{idx + 1}]" for idx in range(full_size)] + + # massive insertion on empty + group0_ids = all_ids[0:first_group_size] + group0_texts = all_texts[0:first_group_size] + inserted_ids0 = vector_store_d2.add_texts( + texts=group0_texts, + ids=group0_ids, + ) + assert set(inserted_ids0) == set(group0_ids) + # massive insertion with many overwrites scattered through + # (we change the text to later check on DB for successful update) + _s, _e, _st = second_group_slicer + group1_ids = all_ids[_s:_e:_st] + all_ids[first_group_size:full_size] + group1_texts = [ + txt.upper() + for txt in (all_texts[_s:_e:_st] + all_texts[first_group_size:full_size]) + ] + inserted_ids1 = vector_store_d2.add_texts( + texts=group1_texts, + ids=group1_ids, + ) + assert set(inserted_ids1) == set(group1_ids) + # final read (we want the IDs to do a full check) + expected_text_by_id = { + **dict(zip(group0_ids, group0_texts)), + **dict(zip(group1_ids, group1_texts)), + } + full_results = vector_store_d2.similarity_search_with_score_id_by_vector( embedding=[1.0, 1.0], k=full_size, ) - ) - for doc, embedding, doc_id in full_results_with_embeddings: - 
assert doc.page_content == expected_text_by_id[doc_id] - assert embedding == expected_embedding_by_id[doc_id] - - -def test_cassandra_vectorstore_delete_by_metadata_sync() -> None: - """Testing delete_by_metadata_filter.""" - vector_store_d2 = _vectorstore_from_documents( - docs=[], - embedding=_embedding_d2(), - ) - - full_size = 400 - # one in ... will be deleted - deletee_ratio = 3 - - documents = [ - Document(page_content="[1,1]", metadata={"deletee": doc_i % deletee_ratio == 0}) - for doc_i in range(full_size) - ] - num_deletees = len([doc for doc in documents if doc.metadata["deletee"]]) - - inserted_ids0 = vector_store_d2.add_documents(documents) - assert len(inserted_ids0) == len(documents) - - d_result0 = vector_store_d2.delete_by_metadata_filter({"deletee": True}) - assert d_result0 == num_deletees - count_on_store0 = len(vector_store_d2.similarity_search("[1,1]", k=full_size + 1)) - assert count_on_store0 == full_size - num_deletees - - with pytest.raises(ValueError, match="does not accept an empty"): - vector_store_d2.delete_by_metadata_filter({}) - count_on_store1 = len(vector_store_d2.similarity_search("[1,1]", k=full_size + 1)) - assert count_on_store1 == full_size - num_deletees - - -async def test_cassandra_vectorstore_delete_by_metadata_async() -> None: - """Testing delete_by_metadata_filter, async version.""" - vector_store_d2 = await _vectorstore_from_documents_async( - docs=[], - embedding=_embedding_d2(), - ) - full_size = 400 - # one in ... 
will be deleted - deletee_ratio = 3 - - documents = [ - Document(page_content="[1,1]", metadata={"deletee": doc_i % deletee_ratio == 0}) - for doc_i in range(full_size) - ] - num_deletees = len([doc for doc in documents if doc.metadata["deletee"]]) - - inserted_ids0 = await vector_store_d2.aadd_documents(documents) - assert len(inserted_ids0) == len(documents) - - d_result0 = await vector_store_d2.adelete_by_metadata_filter({"deletee": True}) - assert d_result0 == num_deletees - count_on_store0 = len( - await vector_store_d2.asimilarity_search("[1,1]", k=full_size + 1) - ) - assert count_on_store0 == full_size - num_deletees - - with pytest.raises(ValueError, match="does not accept an empty"): - await vector_store_d2.adelete_by_metadata_filter({}) - count_on_store1 = len( - await vector_store_d2.asimilarity_search("[1,1]", k=full_size + 1) - ) - assert count_on_store1 == full_size - num_deletees - - -def test_cassandra_replace_metadata() -> None: - """Test of replacing metadata.""" - N_DOCS = 100 - REPLACE_RATIO = 2 # one in ... will have replaced metadata - BATCH_SIZE = 3 - - vstore_f1 = _vectorstore_from_texts( - texts=[], - metadata_indexing=("allowlist", ["field1", "field2"]), - table_name="vector_test_table_indexing", - embedding=ConsistentFakeEmbeddings(), - ) - orig_documents = [ - Document( - page_content=f"doc_{doc_i}", - id=f"doc_id_{doc_i}", - metadata={"field1": f"f1_{doc_i}", "otherf": "pre"}, + for doc, _, doc_id in full_results: + assert doc.page_content == expected_text_by_id[doc_id] + + async def test_cassandra_vectorstore_massive_insert_replace_async( + self, + vector_store_d2: Cassandra, + ) -> None: + """ + Testing the insert-many-and-replace-some patterns thoroughly. + Async version. 
+ """ + full_size = 300 + first_group_size = 150 + second_group_slicer = [30, 100, 2] + + all_ids = [f"doc_{idx}" for idx in range(full_size)] + all_texts = [f"[0,{idx + 1}]" for idx in range(full_size)] + all_embeddings = [[0, idx + 1] for idx in range(full_size)] + + # massive insertion on empty + group0_ids = all_ids[0:first_group_size] + group0_texts = all_texts[0:first_group_size] + + inserted_ids0 = await vector_store_d2.aadd_texts( + texts=group0_texts, + ids=group0_ids, ) - for doc_i in range(N_DOCS) - ] - vstore_f1.add_documents(orig_documents) - - ids_to_replace = [ - f"doc_id_{doc_i}" for doc_i in range(N_DOCS) if doc_i % REPLACE_RATIO == 0 - ] - - # various kinds of replacement at play here: - def _make_new_md(mode: int, doc_id: str) -> dict[str, str]: - if mode == 0: - return {} - elif mode == 1: - return {"field2": f"NEW_{doc_id}"} - elif mode == 2: - return {"field2": f"NEW_{doc_id}", "ofherf2": "post"} - else: - return {"ofherf2": "post"} - - ids_to_new_md = { - doc_id: _make_new_md(rep_i % 4, doc_id) - for rep_i, doc_id in enumerate(ids_to_replace) - } - - vstore_f1.replace_metadata(ids_to_new_md, batch_size=BATCH_SIZE) - # thorough check - expected_id_to_metadata: dict[str, dict] = { - **{(document.id or ""): document.metadata for document in orig_documents}, - **ids_to_new_md, - } - for hit in vstore_f1.similarity_search("doc", k=N_DOCS + 1): - assert hit.id is not None - assert hit.metadata == expected_id_to_metadata[hit.id] - - -async def test_cassandra_replace_metadata_async() -> None: - """Test of replacing metadata.""" - N_DOCS = 100 - REPLACE_RATIO = 2 # one in ... 
will have replaced metadata - BATCH_SIZE = 3 - - vstore_f1 = _vectorstore_from_texts( - texts=[], - metadata_indexing=("allowlist", ["field1", "field2"]), - table_name="vector_test_table_indexing", - embedding=ConsistentFakeEmbeddings(), - ) - orig_documents = [ - Document( - page_content=f"doc_{doc_i}", - id=f"doc_id_{doc_i}", - metadata={"field1": f"f1_{doc_i}", "otherf": "pre"}, + assert set(inserted_ids0) == set(group0_ids) + # massive insertion with many overwrites scattered through + # (we change the text to later check on DB for successful update) + _s, _e, _st = second_group_slicer + group1_ids = all_ids[_s:_e:_st] + all_ids[first_group_size:full_size] + group1_texts = [ + txt.upper() + for txt in (all_texts[_s:_e:_st] + all_texts[first_group_size:full_size]) + ] + inserted_ids1 = await vector_store_d2.aadd_texts( + texts=group1_texts, + ids=group1_ids, ) - for doc_i in range(N_DOCS) - ] - await vstore_f1.aadd_documents(orig_documents) - - ids_to_replace = [ - f"doc_id_{doc_i}" for doc_i in range(N_DOCS) if doc_i % REPLACE_RATIO == 0 - ] - - # various kinds of replacement at play here: - def _make_new_md(mode: int, doc_id: str) -> dict[str, str]: - if mode == 0: - return {} - elif mode == 1: - return {"field2": f"NEW_{doc_id}"} - elif mode == 2: - return {"field2": f"NEW_{doc_id}", "ofherf2": "post"} - else: - return {"ofherf2": "post"} - - ids_to_new_md = { - doc_id: _make_new_md(rep_i % 4, doc_id) - for rep_i, doc_id in enumerate(ids_to_replace) - } - - await vstore_f1.areplace_metadata(ids_to_new_md, concurrency=BATCH_SIZE) - # thorough check - expected_id_to_metadata: dict[str, dict] = { - **{(document.id or ""): document.metadata for document in orig_documents}, - **ids_to_new_md, - } - for hit in await vstore_f1.asimilarity_search("doc", k=N_DOCS + 1): - assert hit.id is not None - assert hit.metadata == expected_id_to_metadata[hit.id] - - -def test_cassandra_vectorstore_mmr_sync() -> None: - """MMR testing. 
We work on the unit circle with angle multiples - of 2*pi/20 and prepare a store with known vectors for a controlled - MMR outcome. - """ - vector_store_d2 = _vectorstore_from_documents( - docs=[], - embedding=_embedding_d2(), - ) - - def _v_from_i(i: int, n: int) -> str: - angle = 2 * math.pi * i / n - vector = [math.cos(angle), math.sin(angle)] - return json.dumps(vector) - - i_vals = [0, 4, 5, 13] - n_val = 20 - vector_store_d2.add_texts( - [_v_from_i(i, n_val) for i in i_vals], metadatas=[{"i": i} for i in i_vals] - ) - res1 = vector_store_d2.max_marginal_relevance_search( - _v_from_i(3, n_val), - k=2, - fetch_k=3, - ) - res_i_vals = {doc.metadata["i"] for doc in res1} - assert res_i_vals == {"0.0", "4.0"} - - -async def test_cassandra_vectorstore_mmr_async() -> None: - """MMR testing. We work on the unit circle with angle multiples - of 2*pi/20 and prepare a store with known vectors for a controlled - MMR outcome. - Async version. - """ - vector_store_d2 = await _vectorstore_from_documents_async( - docs=[], - embedding=_embedding_d2(), - ) - - def _v_from_i(i: int, n: int) -> str: - angle = 2 * math.pi * i / n - vector = [math.cos(angle), math.sin(angle)] - return json.dumps(vector) - - i_vals = [0, 4, 5, 13] - n_val = 20 - await vector_store_d2.aadd_texts( - [_v_from_i(i, n_val) for i in i_vals], - metadatas=[{"i": i} for i in i_vals], - ) - res1 = await vector_store_d2.amax_marginal_relevance_search( - _v_from_i(3, n_val), - k=2, - fetch_k=3, - ) - res_i_vals = {doc.metadata["i"] for doc in res1} - assert res_i_vals == {"0.0", "4.0"} - - -def test_cassandra_vectorstore_metadata_filter() -> None: - """Metadata filtering.""" - vstore = _vectorstore_from_documents( - docs=_metadata_documents(), - embedding=_embedding_d2(), - ) - # no filters - res0 = vstore.similarity_search("[-1,-1]", k=10) - assert {doc.metadata["letter"] for doc in res0} == set("qwreio") - # single filter - res1 = vstore.similarity_search( - "[-1,-1]", - k=10, - filter={"group": "vowel"}, - 
) - assert {doc.metadata["letter"] for doc in res1} == set("eio") - # multiple filters - res2 = vstore.similarity_search( - "[-1,-1]", - k=10, - filter={"group": "consonant", "ord": str(ord("q"))}, - ) - assert {doc.metadata["letter"] for doc in res2} == set("q") - # excessive filters - res3 = vstore.similarity_search( - "[-1,-1]", - k=10, - filter={"group": "consonant", "ord": str(ord("q")), "case": "upper"}, - ) - assert res3 == [] - + assert set(inserted_ids1) == set(group1_ids) + # final read (we want the IDs to do a full check) + expected_text_by_id = dict(zip(all_ids, all_texts)) + full_results = await vector_store_d2.asimilarity_search_with_score_id_by_vector( + embedding=[1.0, 1.0], + k=full_size, + ) + for doc, _, doc_id in full_results: + assert doc.page_content == expected_text_by_id[doc_id] + expected_embedding_by_id = dict(zip(all_ids, all_embeddings)) + full_results_with_embeddings = ( + await vector_store_d2.asimilarity_search_with_embedding_id_by_vector( + embedding=[1.0, 1.0], + k=full_size, + ) + ) + for doc, embedding, doc_id in full_results_with_embeddings: + assert doc.page_content == expected_text_by_id[doc_id] + assert embedding == expected_embedding_by_id[doc_id] + + def test_cassandra_vectorstore_delete_by_metadata_sync( + self, + vector_store_d2: Cassandra, + ) -> None: + """Testing delete_by_metadata_filter.""" + full_size = 400 + # one in ... 
will be deleted + deletee_ratio = 3 + + documents = [ + Document( + page_content="[1,1]", metadata={"deletee": doc_i % deletee_ratio == 0} + ) + for doc_i in range(full_size) + ] + num_deletees = len([doc for doc in documents if doc.metadata["deletee"]]) -def test_cassandra_vectorstore_metadata_search_sync() -> None: - """Metadata Search""" - vstore = _vectorstore_from_documents( - docs=_metadata_documents(), - embedding=_embedding_d2(), - ) - # no filters - res0 = vstore.metadata_search(filter={}, n=10) - assert {doc.metadata["letter"] for doc in res0} == set("qwreio") - # single filter - res1 = vstore.metadata_search( - n=10, - filter={"group": "vowel"}, - ) - assert {doc.metadata["letter"] for doc in res1} == set("eio") - # multiple filters - res2 = vstore.metadata_search( - n=10, - filter={"group": "consonant", "ord": str(ord("q"))}, - ) - assert {doc.metadata["letter"] for doc in res2} == set("q") - # excessive filters - res3 = vstore.metadata_search( - n=10, - filter={"group": "consonant", "ord": str(ord("q")), "case": "upper"}, - ) - assert res3 == [] + inserted_ids0 = vector_store_d2.add_documents(documents) + assert len(inserted_ids0) == len(documents) + d_result0 = vector_store_d2.delete_by_metadata_filter({"deletee": True}) + assert d_result0 == num_deletees + count_on_store0 = len( + vector_store_d2.similarity_search("[1,1]", k=full_size + 1) + ) + assert count_on_store0 == full_size - num_deletees -async def test_cassandra_vectorstore_metadata_search_async() -> None: - """Metadata Search""" - vstore = await _vectorstore_from_documents_async( - docs=_metadata_documents(), - embedding=_embedding_d2(), - ) + with pytest.raises(ValueError, match="does not accept an empty"): + vector_store_d2.delete_by_metadata_filter({}) + count_on_store1 = len( + vector_store_d2.similarity_search("[1,1]", k=full_size + 1) + ) + assert count_on_store1 == full_size - num_deletees + + async def test_cassandra_vectorstore_delete_by_metadata_async( + self, + vector_store_d2: 
Cassandra, + ) -> None: + """Testing delete_by_metadata_filter, async version.""" + full_size = 400 + # one in ... will be deleted + deletee_ratio = 3 + + documents = [ + Document( + page_content="[1,1]", metadata={"deletee": doc_i % deletee_ratio == 0} + ) + for doc_i in range(full_size) + ] + num_deletees = len([doc for doc in documents if doc.metadata["deletee"]]) - # no filters - res0 = await vstore.ametadata_search(filter={}, n=10) - assert {doc.metadata["letter"] for doc in res0} == set("qwreio") - # single filter - res1 = vstore.metadata_search( - n=10, - filter={"group": "vowel"}, - ) - assert {doc.metadata["letter"] for doc in res1} == set("eio") - # multiple filters - res2 = await vstore.ametadata_search( - n=10, - filter={"group": "consonant", "ord": str(ord("q"))}, - ) - assert {doc.metadata["letter"] for doc in res2} == set("q") - # excessive filters - res3 = await vstore.ametadata_search( - n=10, - filter={"group": "consonant", "ord": str(ord("q")), "case": "upper"}, - ) - assert res3 == [] + inserted_ids0 = await vector_store_d2.aadd_documents(documents) + assert len(inserted_ids0) == len(documents) + d_result0 = await vector_store_d2.adelete_by_metadata_filter({"deletee": True}) + assert d_result0 == num_deletees + count_on_store0 = len( + await vector_store_d2.asimilarity_search("[1,1]", k=full_size + 1) + ) + assert count_on_store0 == full_size - num_deletees -def test_cassandra_vectorstore_get_by_document_id_sync() -> None: - """Get by document_id""" - vstore = _vectorstore_from_documents( - docs=_metadata_documents(), - embedding=_embedding_d2(), - ) - # invalid id - invalid = vstore.get_by_document_id(document_id="z") - assert invalid is None - # valid id - valid = vstore.get_by_document_id(document_id="q") - assert isinstance(valid, Document) - assert valid.id == "q" - assert valid.page_content == "[1,2]" - assert valid.metadata["group"] == "consonant" - assert valid.metadata["letter"] == "q" - - -async def 
test_cassandra_vectorstore_get_by_document_id_async() -> None: - """Get by document_id""" - vstore = await _vectorstore_from_documents_async( - docs=_metadata_documents(), - embedding=_embedding_d2(), - ) - # invalid id - invalid = await vstore.aget_by_document_id(document_id="z") - assert invalid is None - # valid id - valid = await vstore.aget_by_document_id(document_id="q") - assert isinstance(valid, Document) - assert valid.id == "q" - assert valid.page_content == "[1,2]" - assert valid.metadata["group"] == "consonant" - assert valid.metadata["letter"] == "q" - - -def test_cassandra_vectorstore_similarity_scale_sync() -> None: - """Scale of the similarity scores.""" - vstore = _vectorstore_from_texts( - texts=["[1,1]", "[-1,-1]"], - ids=["near", "far"], - embedding=_embedding_d2(), - ) - query = "[0.99999,1.00001]" + with pytest.raises(ValueError, match="does not accept an empty"): + await vector_store_d2.adelete_by_metadata_filter({}) + count_on_store1 = len( + await vector_store_d2.asimilarity_search("[1,1]", k=full_size + 1) + ) + assert count_on_store1 == full_size - num_deletees + + def test_cassandra_replace_metadata(self) -> None: + """Test of replacing metadata.""" + N_DOCS = 100 + REPLACE_RATIO = 2 # one in ... 
will have replaced metadata + BATCH_SIZE = 3 + + vstore_f1 = _vectorstore_from_texts( + texts=[], + metadata_indexing=("allowlist", ["field1", "field2"]), + table_name="vector_test_table_indexing", + embedding=ConsistentFakeEmbeddings(), + ) + orig_documents = [ + Document( + page_content=f"doc_{doc_i}", + id=f"doc_id_{doc_i}", + metadata={"field1": f"f1_{doc_i}", "otherf": "pre"}, + ) + for doc_i in range(N_DOCS) + ] + vstore_f1.add_documents(orig_documents) - res1 = vstore.similarity_search_with_score( - query, - k=2, - ) - scores = [sco for _, sco in res1] - sco_near, sco_far = scores - assert sco_far >= 0 - assert abs(1 - sco_near) < MATCH_EPSILON - assert sco_far < EUCLIDEAN_MIN_SIM_UNIT_VECTORS + MATCH_EPSILON + ids_to_replace = [ + f"doc_id_{doc_i}" for doc_i in range(N_DOCS) if doc_i % REPLACE_RATIO == 0 + ] + # various kinds of replacement at play here: + def _make_new_md(mode: int, doc_id: str) -> dict[str, str]: + if mode == 0: + return {} + elif mode == 1: + return {"field2": f"NEW_{doc_id}"} + elif mode == 2: + return {"field2": f"NEW_{doc_id}", "ofherf2": "post"} + else: + return {"ofherf2": "post"} + + ids_to_new_md = { + doc_id: _make_new_md(rep_i % 4, doc_id) + for rep_i, doc_id in enumerate(ids_to_replace) + } + + vstore_f1.replace_metadata(ids_to_new_md, batch_size=BATCH_SIZE) + # thorough check + expected_id_to_metadata: dict[str, dict] = { + **{(document.id or ""): document.metadata for document in orig_documents}, + **ids_to_new_md, + } + for hit in vstore_f1.similarity_search("doc", k=N_DOCS + 1): + assert hit.id is not None + assert hit.metadata == expected_id_to_metadata[hit.id] + + async def test_cassandra_replace_metadata_async(self) -> None: + """Test of replacing metadata.""" + N_DOCS = 100 + REPLACE_RATIO = 2 # one in ... 
will have replaced metadata + BATCH_SIZE = 3 + + vstore_f1 = _vectorstore_from_texts( + texts=[], + metadata_indexing=("allowlist", ["field1", "field2"]), + table_name="vector_test_table_indexing", + embedding=ConsistentFakeEmbeddings(), + ) + orig_documents = [ + Document( + page_content=f"doc_{doc_i}", + id=f"doc_id_{doc_i}", + metadata={"field1": f"f1_{doc_i}", "otherf": "pre"}, + ) + for doc_i in range(N_DOCS) + ] + await vstore_f1.aadd_documents(orig_documents) -async def test_cassandra_vectorstore_similarity_scale_async() -> None: - """Scale of the similarity scores, async version.""" - vstore = await _vectorstore_from_texts_async( - texts=["[1,1]", "[-1,-1]"], - ids=["near", "far"], - embedding=_embedding_d2(), - ) + ids_to_replace = [ + f"doc_id_{doc_i}" for doc_i in range(N_DOCS) if doc_i % REPLACE_RATIO == 0 + ] - query = "[0.99999,1.00001]" - res1 = await vstore.asimilarity_search_with_score( - query, - k=2, + # various kinds of replacement at play here: + def _make_new_md(mode: int, doc_id: str) -> dict[str, str]: + if mode == 0: + return {} + elif mode == 1: + return {"field2": f"NEW_{doc_id}"} + elif mode == 2: + return {"field2": f"NEW_{doc_id}", "ofherf2": "post"} + else: + return {"ofherf2": "post"} + + ids_to_new_md = { + doc_id: _make_new_md(rep_i % 4, doc_id) + for rep_i, doc_id in enumerate(ids_to_replace) + } + + await vstore_f1.areplace_metadata(ids_to_new_md, concurrency=BATCH_SIZE) + # thorough check + expected_id_to_metadata: dict[str, dict] = { + **{(document.id or ""): document.metadata for document in orig_documents}, + **ids_to_new_md, + } + for hit in await vstore_f1.asimilarity_search("doc", k=N_DOCS + 1): + assert hit.id is not None + assert hit.metadata == expected_id_to_metadata[hit.id] + + def test_cassandra_vectorstore_mmr_sync( + self, + vector_store_d2: Cassandra, + ) -> None: + """MMR testing. We work on the unit circle with angle multiples + of 2*pi/20 and prepare a store with known vectors for a controlled + MMR outcome. 
+ """ + + def _v_from_i(i: int, n: int) -> str: + angle = 2 * math.pi * i / n + vector = [math.cos(angle), math.sin(angle)] + return json.dumps(vector) + + i_vals = [0, 4, 5, 13] + n_val = 20 + vector_store_d2.add_texts( + [_v_from_i(i, n_val) for i in i_vals], metadatas=[{"i": i} for i in i_vals] + ) + res1 = vector_store_d2.max_marginal_relevance_search( + _v_from_i(3, n_val), + k=2, + fetch_k=3, + ) + res_i_vals = {doc.metadata["i"] for doc in res1} + assert res_i_vals == {"0.0", "4.0"} + + async def test_cassandra_vectorstore_mmr_async( + self, + vector_store_d2: Cassandra, + ) -> None: + """MMR testing. We work on the unit circle with angle multiples + of 2*pi/20 and prepare a store with known vectors for a controlled + MMR outcome. + Async version. + """ + + def _v_from_i(i: int, n: int) -> str: + angle = 2 * math.pi * i / n + vector = [math.cos(angle), math.sin(angle)] + return json.dumps(vector) + + i_vals = [0, 4, 5, 13] + n_val = 20 + await vector_store_d2.aadd_texts( + [_v_from_i(i, n_val) for i in i_vals], + metadatas=[{"i": i} for i in i_vals], + ) + res1 = await vector_store_d2.amax_marginal_relevance_search( + _v_from_i(3, n_val), + k=2, + fetch_k=3, + ) + res_i_vals = {doc.metadata["i"] for doc in res1} + assert res_i_vals == {"0.0", "4.0"} + + def test_cassandra_vectorstore_metadata_filter( + self, + vector_store_d2: Cassandra, + metadata_documents: list[Document], + ) -> None: + """Metadata filtering.""" + vstore = vector_store_d2 + vstore.add_documents(metadata_documents) + # no filters + res0 = vstore.similarity_search("[-1,-1]", k=10) + assert {doc.metadata["letter"] for doc in res0} == set("qwreio") + # single filter + res1 = vstore.similarity_search( + "[-1,-1]", + k=10, + filter={"group": "vowel"}, + ) + assert {doc.metadata["letter"] for doc in res1} == set("eio") + # multiple filters + res2 = vstore.similarity_search( + "[-1,-1]", + k=10, + filter={"group": "consonant", "ord": str(ord("q"))}, + ) + assert {doc.metadata["letter"] for doc in 
res2} == set("q") + # excessive filters + res3 = vstore.similarity_search( + "[-1,-1]", + k=10, + filter={"group": "consonant", "ord": str(ord("q")), "case": "upper"}, + ) + assert res3 == [] + + def test_cassandra_vectorstore_metadata_search_sync( + self, + vector_store_d2: Cassandra, + metadata_documents: list[Document], + ) -> None: + """Metadata Search""" + vstore = vector_store_d2 + vstore.add_documents(metadata_documents) + # no filters + res0 = vstore.metadata_search(filter={}, n=10) + assert {doc.metadata["letter"] for doc in res0} == set("qwreio") + # single filter + res1 = vstore.metadata_search( + n=10, + filter={"group": "vowel"}, + ) + assert {doc.metadata["letter"] for doc in res1} == set("eio") + # multiple filters + res2 = vstore.metadata_search( + n=10, + filter={"group": "consonant", "ord": str(ord("q"))}, + ) + assert {doc.metadata["letter"] for doc in res2} == set("q") + # excessive filters + res3 = vstore.metadata_search( + n=10, + filter={"group": "consonant", "ord": str(ord("q")), "case": "upper"}, + ) + assert res3 == [] + + async def test_cassandra_vectorstore_metadata_search_async( + self, + vector_store_d2: Cassandra, + metadata_documents: list[Document], + ) -> None: + """Metadata Search""" + vstore = vector_store_d2 + await vstore.aadd_documents(metadata_documents) + # no filters + res0 = await vstore.ametadata_search(filter={}, n=10) + assert {doc.metadata["letter"] for doc in res0} == set("qwreio") + # single filter + res1 = vstore.metadata_search( + n=10, + filter={"group": "vowel"}, + ) + assert {doc.metadata["letter"] for doc in res1} == set("eio") + # multiple filters + res2 = await vstore.ametadata_search( + n=10, + filter={"group": "consonant", "ord": str(ord("q"))}, + ) + assert {doc.metadata["letter"] for doc in res2} == set("q") + # excessive filters + res3 = await vstore.ametadata_search( + n=10, + filter={"group": "consonant", "ord": str(ord("q")), "case": "upper"}, + ) + assert res3 == [] + + def 
test_cassandra_vectorstore_get_by_document_id_sync( + self, + vector_store_d2: Cassandra, + metadata_documents: list[Document], + ) -> None: + """Get by document_id""" + vstore = vector_store_d2 + vstore.add_documents(metadata_documents) + # invalid id + invalid = vstore.get_by_document_id(document_id="z") + assert invalid is None + # valid id + valid = vstore.get_by_document_id(document_id="q") + assert isinstance(valid, Document) + assert valid.id == "q" + assert valid.page_content == "[1,2]" + assert valid.metadata["group"] == "consonant" + assert valid.metadata["letter"] == "q" + + async def test_cassandra_vectorstore_get_by_document_id_async( + self, + vector_store_d2: Cassandra, + metadata_documents: list[Document], + ) -> None: + """Get by document_id""" + vstore = vector_store_d2 + await vstore.aadd_documents(metadata_documents) + # invalid id + invalid = await vstore.aget_by_document_id(document_id="z") + assert invalid is None + # valid id + valid = await vstore.aget_by_document_id(document_id="q") + assert isinstance(valid, Document) + assert valid.id == "q" + assert valid.page_content == "[1,2]" + assert valid.metadata["group"] == "consonant" + assert valid.metadata["letter"] == "q" + + @pytest.mark.parametrize( + ("texts", "query"), + [ + ( + ["[1,1]", "[-1,-1]"], + "[0.99999,1.00001]", + ), + ], ) - scores = [sco for _, sco in res1] - sco_near, sco_far = scores - assert sco_far >= 0 - assert abs(1 - sco_near) < MATCH_EPSILON - assert sco_far < EUCLIDEAN_MIN_SIM_UNIT_VECTORS + MATCH_EPSILON - - -def test_cassandra_vectorstore_massive_delete() -> None: - """Larger-scale bulk deletes.""" - vstore = _vectorstore_from_documents( - docs=[], - embedding=_embedding_d2(), + def test_cassandra_vectorstore_similarity_scale_sync( + self, + *, + vector_store_d2: Cassandra, + texts: list[str], + query: str, + ) -> None: + """Scale of the similarity scores.""" + vstore = vector_store_d2 + vstore.add_texts( + texts=texts, + ids=["near", "far"], + ) + res1 = 
vstore.similarity_search_with_score( + query, + k=2, + ) + scores = [sco for _, sco in res1] + sco_near, sco_far = scores + assert sco_far >= 0 + assert abs(1 - sco_near) < MATCH_EPSILON + assert sco_far < EUCLIDEAN_MIN_SIM_UNIT_VECTORS + MATCH_EPSILON + + @pytest.mark.parametrize( + ("texts", "query"), + [ + ( + ["[1,1]", "[-1,-1]"], + "[0.99999,1.00001]", + ), + ], ) - m = 150 - texts = [f"[0,{i + 1 / 7.0}]" for i in range(2 * m)] - ids0 = [f"doc_{i}" for i in range(m)] - ids1 = [f"doc_{i + m}" for i in range(m)] - ids = ids0 + ids1 - vstore.add_texts(texts=texts, ids=ids) - # deleting a bunch of these - del_res0 = vstore.delete(ids0) - assert del_res0 is True - # deleting the rest plus a fake one - del_res1 = vstore.delete([*ids1, "ghost!"]) - assert del_res1 is True # ensure no error - # nothing left - assert vstore.similarity_search("[-1,-1]", k=2 * m) == [] + async def test_cassandra_vectorstore_similarity_scale_async( + self, + *, + vector_store_d2: Cassandra, + texts: list[str], + query: str, + ) -> None: + """Scale of the similarity scores, async version.""" + vstore = vector_store_d2 + await vstore.aadd_texts( + texts=texts, + ids=["near", "far"], + ) + res1 = await vstore.asimilarity_search_with_score( + query, + k=2, + ) + scores = [sco for _, sco in res1] + sco_near, sco_far = scores + assert sco_far >= 0 + assert abs(1 - sco_near) < MATCH_EPSILON + assert sco_far < EUCLIDEAN_MIN_SIM_UNIT_VECTORS + MATCH_EPSILON + + def test_cassandra_vectorstore_massive_delete( + self, + vector_store_d2: Cassandra, + ) -> None: + """Larger-scale bulk deletes.""" + vstore = vector_store_d2 + m = 150 + texts = [f"[0,{i + 1 / 7.0}]" for i in range(2 * m)] + ids0 = [f"doc_{i}" for i in range(m)] + ids1 = [f"doc_{i + m}" for i in range(m)] + ids = ids0 + ids1 + vstore.add_texts(texts=texts, ids=ids) + # deleting a bunch of these + del_res0 = vstore.delete(ids0) + assert del_res0 is True + # deleting the rest plus a fake one + del_res1 = vstore.delete([*ids1, "ghost!"]) + 
assert del_res1 is True # ensure no error + # nothing left + assert vstore.similarity_search("[-1,-1]", k=2 * m) == []