Set add docs default refresh to false (#601)
Set the default refresh to false for the add docs and delete docs APIs and for the add docs params orchestrator. Added tests and examples for auto_refresh=True.

---------

Co-authored-by: pandu-k <[email protected]>
vicilliar and pandu-k authored Sep 15, 2023
1 parent 22f3fa6 commit 88a7fa3
Showing 17 changed files with 163 additions and 28 deletions.
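
As a rough sketch of what the new default means for callers of the HTTP API (the endpoint paths, body shapes and the `refresh` query parameter come from the diff below; the base URL, index name and document fields are placeholder assumptions):

```python
import requests

BASE = "http://localhost:8882"  # assumed local Marqo instance

# Adding documents: refresh now defaults to False, so pass refresh=true
# explicitly only when the documents must be searchable immediately.
requests.post(
    f"{BASE}/indexes/my-index/documents?device=cpu&refresh=true",
    json={
        "documents": [{"_id": "1", "text": "example document"}],
        "tensorFields": ["text"],
    },
)

# Deleting documents: the delete-batch endpoint gets the same default.
requests.post(
    f"{BASE}/indexes/my-index/documents/delete-batch?refresh=true",
    json=["1"],
)
```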
14 changes: 8 additions & 6 deletions README.md
@@ -121,7 +121,8 @@ mq.index("my-first-index").add_documents([
"mobility, life support, and communications for astronauts",
"_id": "article_591"
}],
tensor_fields=["Title", "Description"]
tensor_fields=["Title", "Description"],
auto_refresh=True
)

results = mq.index("my-first-index").search(
@@ -133,6 +134,7 @@ results = mq.index("my-first-index").search(
- `mq` is the client that wraps the `marqo` API.
- `create_index()` creates a new index with default settings. You have the option to specify what model to use. For example, `mq.create_index("my-first-index", model="hf/all_datasets_v4_MiniLM-L6")` will create an index with the default text model `hf/all_datasets_v4_MiniLM-L6`. Experimentation with different models is often required to achieve the best retrieval for your specific use case. Different models also offer a tradeoff between inference speed and relevancy. See [here](https://docs.marqo.ai/1.0.0/Models-Reference/dense_retrieval/) for the full list of models.
- `add_documents()` takes a list of documents, represented as Python dicts, for indexing.
- The `auto_refresh` parameter ensures that documents are available for search as soon as they have been added. When performing heavy `add_documents` operations, leave this as `False` for optimal indexing and search performance (see the sketch after this list).
- You can optionally set a document's ID with the special `_id` field. Otherwise, Marqo will generate one.
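
As referenced above, a brief sketch of the two `auto_refresh` modes, reusing the `mq` client and index from this example (the document list and batch size are placeholders):

```python
# Placeholder: in practice this would be a large list of documents.
many_documents = [
    {"Title": f"Doc {i}", "Description": "Example text", "_id": f"doc_{i}"}
    for i in range(1000)
]

# Bulk indexing: leave auto_refresh at its default (False) for best throughput.
mq.index("my-first-index").add_documents(
    many_documents,
    tensor_fields=["Title", "Description"],
    client_batch_size=64,                    # placeholder batch size
)

# Small, interactive updates: opt in to auto_refresh=True so the new
# document is searchable as soon as the call returns.
mq.index("my-first-index").add_documents(
    [{"Title": "New article", "Description": "A short example", "_id": "article_592"}],
    tensor_fields=["Title", "Description"],
    auto_refresh=True,
)
```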

Let's have a look at the results:
@@ -240,9 +242,7 @@ response = mq.index("my-multimodal-index").add_documents([{
"My Image": "https://raw.githubusercontent.com/marqo-ai/marqo-api-tests/mainline/assets/ai_hippo_realistic.png",
"Description": "The hippopotamus, also called the common hippopotamus or river hippopotamus, is a large semiaquatic mammal native to sub-Saharan Africa",
"_id": "hippo-facts"
}], tensor_fields=["My Image", "Description"])

```
}], tensor_fields=["My Image", "Description"], auto_refresh=True)

```

@@ -303,7 +303,8 @@ mq.index("my-weighted-query-index").add_documents(
"The last known of its species died in 1936.",
},
],
tensor_fields=["Title", "Description"]
tensor_fields=["Title", "Description"],
auto_refresh=True
)

# initially we ask for a type of communications device which is popular in the 21st century
@@ -392,7 +393,8 @@ mq.index("my-first-multimodal-index").add_documents(
},
# We specify which fields to create vectors for.
# Note that captioned_image is treated as a single field.
tensor_fields=["Title", "captioned_image"]
tensor_fields=["Title", "captioned_image"],
auto_refresh=True
)

# Search this index with a simple text query
2 changes: 1 addition & 1 deletion examples/ClothingCLI/simple_marqo_demo.py
@@ -30,7 +30,7 @@ def load_index(index_name: str, number_data: int) -> None:

mq.create_index(index_name, **settings)

mq.index(index_name).add_documents(shirt_data, tensor_fields=['image','label','kids'])
mq.index(index_name).add_documents(shirt_data, tensor_fields=['image','label','kids'], auto_refresh=True)

print("Index successfully created.")

2 changes: 1 addition & 1 deletion examples/ClothingStreamlit/streamlit_marqo_demo.py
@@ -34,7 +34,7 @@ def load_index(number_data):
mq.create_index("demo-search-index", **settings)

with st.spinner("Creating Index..."):
mq.index("demo-search-index").add_documents(shirt_data, tensor_fields=['image', 'label', 'kids'])
mq.index("demo-search-index").add_documents(shirt_data, tensor_fields=['image', 'label', 'kids'], auto_refresh=True)

st.success("Index successfully created.")
except:
4 changes: 2 additions & 2 deletions examples/GPT-examples/article/article.md
@@ -230,7 +230,7 @@ mq.create_index(index_name)
```
Now we index the documents
```python
results = mq.index(index_name).add_documents(documents, tensor_fields = ["name", "text"])
results = mq.index(index_name).add_documents(documents, tensor_fields = ["name", "text"], auto_refresh=True)
```

We can search and see what comes back.
@@ -353,7 +353,7 @@ We can patch, delete or add documents for the agent's background with Marqo. Let's
```python
from iron_data import get_extra_data
extra_docs = [{"text":text, "name":persona} for text in get_extra_data()]
res = mq.index(index_name).add_documents(extra_docs, tensor_fields = ["name", "text"])
res = mq.index(index_name).add_documents(extra_docs, tensor_fields = ["name", "text"], auto_refresh=True)
```
This adds some of the safety information from the iron manual. We will also take the bottom-ranked results (i.e. the least relevant) to make it interesting. The following is the conversation; we can see it weaving its new background into the story nicely!

2 changes: 1 addition & 1 deletion examples/GPT3NewsSummary/README.md
@@ -83,7 +83,7 @@ print('creating a Marqo index')
mq.create_index(DOC_INDEX_NAME)
print('Indexing documents')
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"])
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"], auto_refresh=True)
```

Now that we have indexed our news documents, we can simply use the Marqo Python search API to return relevant context for our GPT3 generation. For the query "q", we use the question and want to match news context based on the "Title" and "Description" text. We also want to filter our documents for "today", which was '2022-11-09'.
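
For example, a hedged sketch of that search call, reusing `mq` and `DOC_INDEX_NAME` from the snippet above (the `date` field name and filter value are assumptions; `searchable_attributes`, `filter_string` and `limit` are standard client arguments):

```python
question = "What is happening in business today?"

results = mq.index(DOC_INDEX_NAME).search(
    q=question,
    searchable_attributes=["Title", "Description"],
    filter_string="date:2022-11-09",   # assumed name of the article-date field
    limit=5,
)

# Join the matched descriptions into a context string for the GPT3 prompt.
context = ". ".join(hit["Description"] for hit in results["hits"])
```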
2 changes: 1 addition & 1 deletion examples/GPT3NewsSummary/main.py
@@ -38,7 +38,7 @@
mq.create_index(DOC_INDEX_NAME)

print('Indexing documents')
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"])
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"], auto_refresh=True)
print('Done')


2 changes: 1 addition & 1 deletion examples/ImageSearchGuide/ImageSearchGuide.md
@@ -139,7 +139,7 @@ output:

Add the documents to the previously created index using the `add_documents()` function:
```python
mq.index(index_name).add_documents(documents, tensor_fields=["image_docker"], device="cpu", client_batch_size= 1)
mq.index(index_name).add_documents(documents, tensor_fields=["image_docker"], device="cpu", client_batch_size= 1, auto_refresh=True)
```
```python
outputs:
4 changes: 2 additions & 2 deletions examples/ImageSearchGuide/imagesearchguide.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions examples/MultiLingual/article.md
@@ -68,7 +68,7 @@ One small adjustment we'll make is to split up text of very long documents (of o
At the end of each loop, we call the `add_documents()` function to insert the document:
```python
mq.index(index_name='my-multilingual-index').add_documents(
device='cuda', auto_refresh=False,
device='cuda',
documents=[{
"_id": doc_id,
"language": lang,
@@ -83,7 +83,7 @@ Here we set the device argument as `"cuda"`. This tells Marqo to use the GPU it
If you don't have a GPU, remove this argument or set it to `"cpu"`. We encourage using a GPU as it will make the `add_documents`
process significantly faster (our testing showed a 6–12x speed up).

We also set the `auto_refresh` argument to `False`. When indexing large volumes of data we encourage you to set this to False, as it optimises the `add_documents` process.
We do not set the `auto_refresh` argument, so it defaults to `False`. When indexing large volumes of data, we encourage you to keep this as `False`, as it optimises the `add_documents` process.
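
If you do want a particular batch to be searchable as soon as the call returns (for example, a small final batch), a sketch of opting in explicitly might look like the following; it reuses the `mq` client and fields from the snippet above, and `tensor_fields` is an assumption:

```python
# Placeholder values standing in for the loop variables in the indexing script.
doc_id, lang, text_chunk = "doc-42-0", "en", "Example document text"

mq.index(index_name='my-multilingual-index').add_documents(
    device='cuda',              # or 'cpu' if no GPU is available
    auto_refresh=True,          # opt in only for small or final batches
    documents=[{
        "_id": doc_id,
        "language": lang,
        "text": text_chunk,
    }],
    tensor_fields=["language", "text"],   # assumed tensor fields
)
```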

And that's the indexing process! Run the script to fill up the Marqo index with documents. It took us around 45 minutes
with an AWS _ml.g4dn.2xlarge_ machine.
4 changes: 2 additions & 2 deletions examples/MultiLingual/eu_legal.py
@@ -75,10 +75,10 @@ def build_index():
# Index the document. The device is set to 'cuda' to take
# advantage of the machine's GPU. If you don't have a GPU,
# change this argument to 'cpu'.
# We set auto_refresh to False which is optimal for indexing
# We do not set auto_refresh, which will make it default to False. This is optimal for indexing
# a lot of documents.
mq.index(index_name=INDEX_NAME).add_documents(
documents=[to_post], device=DEVICE, auto_refresh=False,
documents=[to_post], device=DEVICE,
tensor_fields=["language", "text", "labels"]
)
t1 = datetime.datetime.now()
4 changes: 2 additions & 2 deletions examples/MultiModalSearch/article.md
@@ -494,10 +494,10 @@ mappings2 = {"multimodal":
}}}

# index the document
res = client.index(index_name_context).add_documents([document1], tensor_fields=["multimodal"], device=device, mappings=mappings1)
res = client.index(index_name_context).add_documents([document1], tensor_fields=["multimodal"], device=device, mappings=mappings1, auto_refresh=True)

# index the other using a different mappings
res = client.index(index_name_context).add_documents([document2], tensor_fields=["multimodal"], device=device, mappings=mappings2)
res = client.index(index_name_context).add_documents([document2], tensor_fields=["multimodal"], device=device, mappings=mappings2, auto_refresh=True)
```

To get the vectors to use as context vectors at search time, we need to [retrieve the calculated vectors](https://marqo.pages.dev/0.0.21/API-Reference/documents/). We can then [create a context object](https://marqo.pages.dev/0.0.21/API-Reference/search/#context) that is used at search time.
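
A rough sketch of that flow, assuming the `expose_facets` flag on `get_document` and the `context` argument on `search` described in the linked docs; the document ID, query and weight are placeholders, and `client`, `index_name_context` and `device` come from the snippet above:

```python
# Retrieve the stored document together with its calculated tensor facets.
doc = client.index(index_name_context).get_document(
    document_id="document1",     # placeholder: use the real _id of the indexed document
    expose_facets=True,
)
context_vector = doc["_tensor_facets"][0]["_embedding"]

# Use the retrieved vector as a context vector at search time.
results = client.index(index_name_context).search(
    "a backpack",                # placeholder query
    context={"tensor": [{"vector": context_vector, "weight": 1.0}]},
    device=device,
)
```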
6 changes: 3 additions & 3 deletions src/marqo/tensor_search/api.py
@@ -172,7 +172,7 @@ def add_or_replace_documents(
request: Request,
body: typing.Union[AddDocsBodyParams, List[Dict]],
index_name: str,
refresh: bool = True,
refresh: bool = False,
marqo_config: config.Config = Depends(generate_config),
non_tensor_fields: Optional[List[str]] = Query(default=None),
device: str = Depends(api_validation.validate_device),
@@ -235,10 +235,10 @@ def delete_index(index_name: str, marqo_config: config.Config = Depends(generate
config=marqo_config, index_name=index_name
)


@app.post("/indexes/{index_name}/documents/delete-batch")
def delete_docs(index_name: str, documentIds: List[str], refresh: bool = True,
def delete_docs(index_name: str, documentIds: List[str], refresh: bool = False,
marqo_config: config.Config = Depends(generate_config)):

return tensor_search.delete_documents(
index_name=index_name, config=marqo_config, doc_ids=documentIds,
auto_refresh=refresh
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/index_meta_cache.py
@@ -61,7 +61,7 @@ def get_cache() -> Dict[str, IndexInfo]:


def refresh_index_info_on_interval(config: Config, index_name: str, interval_seconds: int) -> None:
"""Refreshes an index's index_info if inteval_seconds have elapsed since the last time it was refreshed
"""Refreshes an index's index_info if interval_seconds have elapsed since the last time it was refreshed
Non-thread safe, so there is a chance two threads both refresh index_info at the same time.
"""
6 changes: 5 additions & 1 deletion src/marqo/tensor_search/models/add_docs_objects.py
@@ -41,7 +41,11 @@ class AddDocsParamsConfig:


class AddDocsBodyParams(BaseModel):
"""The parameters of the body parameters of tensor_search_add_documents() function"""
"""
Representation of the body parameters of the API add_or_replace_documents() function.
This will be processed by add_docs_params_orchestrator (along with other query parameters)
into an AddDocsParams object to be given to tensor_search.add_documents()
"""
class Config:
arbitrary_types_allowed = True
allow_mutation = False
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/web/api_utils.py
@@ -136,7 +136,7 @@ def decode_mappings(mappings: Optional[str] = None) -> dict:


def add_docs_params_orchestrator(index_name: str, body: Union[AddDocsBodyParams, List[Dict]],
device: str, auto_refresh: bool = True, non_tensor_fields: Optional[List[str]] = None,
device: str, auto_refresh: bool = False, non_tensor_fields: Optional[List[str]] = None,
mappings: Optional[dict] = dict(), model_auth: Optional[ModelAuth] = None,
image_download_headers: Optional[dict] = dict(),
use_existing_tensors: Optional[bool] = False, query_parameters: Optional[Dict] = dict()) -> AddDocsParams:
106 changes: 105 additions & 1 deletion tests/tensor_search/test_api.py
@@ -6,7 +6,7 @@
from tests.marqo_test import MarqoTestCase


class ApiTests(MarqoTestCase):
class ApiTestsAddDocs(MarqoTestCase):
def setUp(self):
api.OPENSEARCH_URL = 'http://localhost:0000'
self.client = TestClient(api.app)
@@ -90,3 +90,107 @@ def test_add_or_replace_documents_fields_undefined_query_param(self):
)
self.assertEqual(response.status_code, 400)
mock_add_documents.assert_not_called()

def test_add_or_replace_documents_defaults(self):
"""
Ensures that API calls to add or replace documents call tensor_search.add_documents
with the correct defaults (e.g. auto_refresh)
"""
with mock.patch('marqo.tensor_search.tensor_search.add_documents') as mock_add_documents:
response = self.client.post(
"/indexes/index1/documents?device=cpu",
json={
"documents": [
{
"id": "1",
"text": "This is a test document",
}
],
"tensorFields": ['text']
},
)
self.assertEqual(response.status_code, 200)
mock_add_documents.assert_called_once()
args, kwargs = mock_add_documents.call_args

# Assert that add documents is called with the correct default arguments
assert kwargs["add_docs_params"].auto_refresh == False
assert kwargs["add_docs_params"].use_existing_tensors == False

def test_add_or_replace_documents_auto_refresh_true(self):
"""
Ensures that calling add documents with some parameters set to non-default values
(refresh, use_existing_tensors) works as expected.
"""
with mock.patch('marqo.tensor_search.tensor_search.add_documents') as mock_add_documents:
response = self.client.post(
"/indexes/index1/documents?device=cpu&refresh=true",
json={
"documents": [
{
"id": "1",
"text": "This is a test document",
}
],
"useExistingTensors": True,
"tensorFields": ['text']
},
)

self.assertEqual(response.status_code, 200)
mock_add_documents.assert_called_once()
args, kwargs = mock_add_documents.call_args

# Assert that add documents is called with the correct new arguments
assert kwargs["add_docs_params"].auto_refresh == True
assert kwargs["add_docs_params"].use_existing_tensors == True


class ApiTestsDeleteDocs(MarqoTestCase):
def setUp(self):
api.OPENSEARCH_URL = 'http://localhost:0000'
self.client = TestClient(api.app)

def test_delete_docs_defaults(self):
"""
Ensures that API calls to delete documents call tensor_search.delete_documents
with the correct defaults (e.g. auto_refresh)
"""

with mock.patch('marqo.tensor_search.tensor_search.delete_documents') as mock_delete_documents:
response = self.client.post(
"/indexes/index1/documents/delete-batch",
json=['0', '1', '2']
)
"""
TODO: figure out why this format results in an error:
json={
"documentIds": ['0', '1', '2']
}
"""

self.assertEqual(response.status_code, 200)
mock_delete_documents.assert_called_once()
args, kwargs = mock_delete_documents.call_args

# Assert that delete_documents is called with the correct default arguments
assert kwargs["auto_refresh"] == False

def test_delete_docs_auto_refresh_true(self):
"""
Ensures that API calls to delete documents with parameters set (auto_refresh=True)
reflect those in calls to tensor_search.delete_documents
"""

with mock.patch('marqo.tensor_search.tensor_search.delete_documents') as mock_delete_documents:
response = self.client.post(
"/indexes/index1/documents/delete-batch?refresh=true",
json=['0', '1', '2']
)

self.assertEqual(response.status_code, 200)
mock_delete_documents.assert_called_once()
args, kwargs = mock_delete_documents.call_args

# Assert that delete_documents is called with the correct new arguments
assert kwargs["auto_refresh"] == True
25 changes: 25 additions & 0 deletions tests/tensor_search/test_api_utils.py
@@ -113,6 +113,31 @@ def test_add_docs_params_orchestrator(self):
assert result.docs == [{"test": "doc"}]
assert result.image_download_headers == {"header1": "value1"}

def test_add_docs_params_orchestrator_defaults(self):
"""
Ensures that certain defaults are correct when they are not specified in calls to
add_docs_params_orchestrator (e.g. use_existing_tensors, auto_refresh)
"""

# Set up the bare minimum arguments for the function
index_name = "test-index"
body = AddDocsBodyParams(documents=[{"test": "doc"}],
nonTensorFields=["field1"],
imageDownloadHeaders={"header1": "value1"},
modelAuth=ModelAuth(s3=S3Auth(aws_secret_access_key="test", aws_access_key_id="test")),
mappings={"map1": "value1"})
device = "test-device"

# Call the function with the arguments
result = add_docs_params_orchestrator(index_name, body, device)

# Assert that the defaults are correct
assert isinstance(result, AddDocsParams)
assert result.use_existing_tensors == False
assert result.auto_refresh == False



def test_add_docs_params_orchestrator_deprecated_query_parameters(self):
# Set up the arguments for the function
index_name = "test-index"
