Set add docs default refresh to false (#601)
Set the default refresh to false for the add docs and delete docs APIs and for the add docs params orchestrator. Added tests and examples for auto_refresh=True.

---------

Co-authored-by: pandu-k <[email protected]>
vicilliar and pandu-k authored Sep 15, 2023
1 parent 22f3fa6 commit 88a7fa3
Showing 17 changed files with 163 additions and 28 deletions.
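
As a rough sketch of what the new default means for callers of the HTTP API (the endpoint paths, body shapes and the `refresh` query parameter come from the diff below; the base URL, index name and document fields are placeholder assumptions):

```python
import requests

BASE = "http://localhost:8882"  # assumed local Marqo instance

# Adding documents: refresh now defaults to False, so pass refresh=true
# explicitly only when the documents must be searchable immediately.
requests.post(
    f"{BASE}/indexes/my-index/documents?device=cpu&refresh=true",
    json={
        "documents": [{"_id": "1", "text": "example document"}],
        "tensorFields": ["text"],
    },
)

# Deleting documents: the delete-batch endpoint gets the same default.
requests.post(
    f"{BASE}/indexes/my-index/documents/delete-batch?refresh=true",
    json=["1"],
)
```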
14 changes: 8 additions & 6 deletions README.md
@@ -121,7 +121,8 @@ mq.index("my-first-index").add_documents([
"mobility, life support, and communications for astronauts",
"_id": "article_591"
}],
tensor_fields=["Title", "Description"]
tensor_fields=["Title", "Description"],
auto_refresh=True
)

results = mq.index("my-first-index").search(
@@ -133,6 +134,7 @@ results = mq.index("my-first-index").search(
- `mq` is the client that wraps the `marqo` API.
- `create_index()` creates a new index with default settings. You have the option to specify what model to use. For example, `mq.create_index("my-first-index", model="hf/all_datasets_v4_MiniLM-L6")` will create an index with the default text model `hf/all_datasets_v4_MiniLM-L6`. Experimentation with different models is often required to achieve the best retrieval for your specific use case. Different models also offer a tradeoff between inference speed and relevancy. See [here](https://docs.marqo.ai/1.0.0/Models-Reference/dense_retrieval/) for the full list of models.
- `add_documents()` takes a list of documents, represented as Python dicts, for indexing.
- The `auto_refresh` parameter ensures that documents are available for search as soon as they have been added. When performing heavy `add_documents` operations, leave this as `False` for optimal indexing and search performance (see the sketch after this list).
- You can optionally set a document's ID with the special `_id` field. Otherwise, Marqo will generate one.
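
As referenced above, a brief sketch of the two `auto_refresh` modes, reusing the `mq` client and index from this example (the document list and batch size are placeholders):

```python
# Placeholder: in practice this would be a large list of documents.
many_documents = [
    {"Title": f"Doc {i}", "Description": "Example text", "_id": f"doc_{i}"}
    for i in range(1000)
]

# Bulk indexing: leave auto_refresh at its default (False) for best throughput.
mq.index("my-first-index").add_documents(
    many_documents,
    tensor_fields=["Title", "Description"],
    client_batch_size=64,                    # placeholder batch size
)

# Small, interactive updates: opt in to auto_refresh=True so the new
# document is searchable as soon as the call returns.
mq.index("my-first-index").add_documents(
    [{"Title": "New article", "Description": "A short example", "_id": "article_592"}],
    tensor_fields=["Title", "Description"],
    auto_refresh=True,
)
```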

Let's have a look at the results:
@@ -240,9 +242,7 @@ response = mq.index("my-multimodal-index").add_documents([{
"My Image": "https://raw.githubusercontent.com/marqo-ai/marqo-api-tests/mainline/assets/ai_hippo_realistic.png",
"Description": "The hippopotamus, also called the common hippopotamus or river hippopotamus, is a large semiaquatic mammal native to sub-Saharan Africa",
"_id": "hippo-facts"
}], tensor_fields=["My Image", "Description"])

```
}], tensor_fields=["My Image", "Description"], auto_refresh=True)

```

@@ -303,7 +303,8 @@ mq.index("my-weighted-query-index").add_documents(
"The last known of its species died in 1936.",
},
],
tensor_fields=["Title", "Description"]
tensor_fields=["Title", "Description"],
auto_refresh=True
)

# initially we ask for a type of communications device which is popular in the 21st century
@@ -392,7 +393,8 @@ mq.index("my-first-multimodal-index").add_documents(
},
# We specify which fields to create vectors for.
# Note that captioned_image is treated as a single field.
tensor_fields=["Title", "captioned_image"]
tensor_fields=["Title", "captioned_image"],
auto_refresh=True
)

# Search this index with a simple text query
2 changes: 1 addition & 1 deletion examples/ClothingCLI/simple_marqo_demo.py
@@ -30,7 +30,7 @@ def load_index(index_name: str, number_data: int) -> None:

mq.create_index(index_name, **settings)

mq.index(index_name).add_documents(shirt_data, tensor_fields=['image','label','kids'])
mq.index(index_name).add_documents(shirt_data, tensor_fields=['image','label','kids'], auto_refresh=True)

print("Index successfully created.")

2 changes: 1 addition & 1 deletion examples/ClothingStreamlit/streamlit_marqo_demo.py
@@ -34,7 +34,7 @@ def load_index(number_data):
mq.create_index("demo-search-index", **settings)

with st.spinner("Creating Index..."):
mq.index("demo-search-index").add_documents(shirt_data, tensor_fields=['image', 'label', 'kids'])
mq.index("demo-search-index").add_documents(shirt_data, tensor_fields=['image', 'label', 'kids'], auto_refresh=True)

st.success("Index successfully created.")
except:
4 changes: 2 additions & 2 deletions examples/GPT-examples/article/article.md
@@ -230,7 +230,7 @@ mq.create_index(index_name)
```
Now we index the documents
```python
results = mq.index(index_name).add_documents(documents, tensor_fields = ["name", "text"])
results = mq.index(index_name).add_documents(documents, tensor_fields = ["name", "text"], auto_refresh=True)
```

We can search and see what comes back.
@@ -353,7 +353,7 @@ We can patch, delete or add documents for the agent's background with Marqo. Let's
```python
from iron_data import get_extra_data
extra_docs = [{"text":text, "name":persona} for text in get_extra_data()]
res = mq.index(index_name).add_documents(extra_docs, tensor_fields = ["name", "text"])
res = mq.index(index_name).add_documents(extra_docs, tensor_fields = ["name", "text"], auto_refresh=True)
```
This adds some of the safety information from the iron manual. We will also take the bottom-ranked results (i.e. the least relevant) to make it interesting. The following is the conversation; we can see it weaving its new background into the story nicely!

2 changes: 1 addition & 1 deletion examples/GPT3NewsSummary/README.md
@@ -83,7 +83,7 @@ print('creating a Marqo index')
mq.create_index(DOC_INDEX_NAME)
print('Indexing documents')
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"])
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"], auto_refresh=True)
```

Now that we have indexed our news documents, we can simply use the Marqo Python search API to return relevant context for our GPT3 generation. For the query "q", we use the question and want to match news context based on the "Title" and "Description" text. We also want to filter our documents for "today", which was '2022-11-09'.
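
For example, a hedged sketch of that search call, reusing `mq` and `DOC_INDEX_NAME` from the snippet above (the `date` field name and filter value are assumptions; `searchable_attributes`, `filter_string` and `limit` are standard client arguments):

```python
question = "What is happening in business today?"

results = mq.index(DOC_INDEX_NAME).search(
    q=question,
    searchable_attributes=["Title", "Description"],
    filter_string="date:2022-11-09",   # assumed name of the article-date field
    limit=5,
)

# Join the matched descriptions into a context string for the GPT3 prompt.
context = ". ".join(hit["Description"] for hit in results["hits"])
```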
2 changes: 1 addition & 1 deletion examples/GPT3NewsSummary/main.py
@@ -38,7 +38,7 @@
mq.create_index(DOC_INDEX_NAME)

print('Indexing documents')
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"])
mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS, tensor_fields= ["Title", "Description"], auto_refresh=True)
print('Done')


2 changes: 1 addition & 1 deletion examples/ImageSearchGuide/ImageSearchGuide.md
@@ -139,7 +139,7 @@ output:

Add the documents to the previously created index using the `add_documents()` function:
```python
mq.index(index_name).add_documents(documents, tensor_fields=["image_docker"], device="cpu", client_batch_size= 1)
mq.index(index_name).add_documents(documents, tensor_fields=["image_docker"], device="cpu", client_batch_size= 1, auto_refresh=True)
```
```python
outputs:
4 changes: 2 additions & 2 deletions examples/ImageSearchGuide/imagesearchguide.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions examples/MultiLingual/article.md
@@ -68,7 +68,7 @@ One small adjustment we'll make is to split up text of very long documents (of o
At the end of each loop, we call the `add_documents()` function to insert the document:
```python
mq.index(index_name='my-multilingual-index').add_documents(
device='cuda', auto_refresh=False,
device='cuda',
documents=[{
"_id": doc_id,
"language": lang,
@@ -83,7 +83,7 @@ Here we set the device argument as `"cuda"`. This tells Marqo to use the GPU it
If you don't have a GPU, remove this argument or set it to `"cpu"`. We encourage using a GPU as it will make the `add_documents`
process significantly faster (our testing showed a 6–12x speed up).

We also set the `auto_refresh` argument to `False`. When indexing large volumes of data we encourage you to set this to False, as it optimises the `add_documents` process.
We do not set the `auto_refresh` argument, so it defaults to `False`. When indexing large volumes of data, we encourage you to keep this as `False`, as it optimises the `add_documents` process.
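
If you do want a particular batch to be searchable as soon as the call returns (for example, a small final batch), a sketch of opting in explicitly might look like the following; it reuses the `mq` client and fields from the snippet above, and `tensor_fields` is an assumption:

```python
# Placeholder values standing in for the loop variables in the indexing script.
doc_id, lang, text_chunk = "doc-42-0", "en", "Example document text"

mq.index(index_name='my-multilingual-index').add_documents(
    device='cuda',              # or 'cpu' if no GPU is available
    auto_refresh=True,          # opt in only for small or final batches
    documents=[{
        "_id": doc_id,
        "language": lang,
        "text": text_chunk,
    }],
    tensor_fields=["language", "text"],   # assumed tensor fields
)
```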

And that's the indexing process! Run the script to fill up the Marqo index with documents. It took us around 45 minutes
with an AWS _ml.g4dn.2xlarge_ machine.
4 changes: 2 additions & 2 deletions examples/MultiLingual/eu_legal.py
@@ -75,10 +75,10 @@ def build_index():
# Index the document. The device is set to 'cuda' to take
# advantage of the machine's GPU. If you don't have a GPU,
# change this argument to 'cpu'.
# We set auto_refresh to False which is optimal for indexing
# We do not set auto_refresh, which will make it default to False. This is optimal for indexing
# a lot of documents.
mq.index(index_name=INDEX_NAME).add_documents(
documents=[to_post], device=DEVICE, auto_refresh=False,
documents=[to_post], device=DEVICE,
tensor_fields=["language", "text", "labels"]
)
t1 = datetime.datetime.now()
4 changes: 2 additions & 2 deletions examples/MultiModalSearch/article.md
@@ -494,10 +494,10 @@ mappings2 = {"multimodal":
}}}

# index the document
res = client.index(index_name_context).add_documents([document1], tensor_fields=["multimodal"], device=device, mappings=mappings1)
res = client.index(index_name_context).add_documents([document1], tensor_fields=["multimodal"], device=device, mappings=mappings1, auto_refresh=True)

# index the other using a different mappings
res = client.index(index_name_context).add_documents([document2], tensor_fields=["multimodal"], device=device, mappings=mappings2)
res = client.index(index_name_context).add_documents([document2], tensor_fields=["multimodal"], device=device, mappings=mappings2, auto_refresh=True)
```

To get the vectors to use as context vectors at search time, we need to [retrieve the calculated vectors](https://marqo.pages.dev/0.0.21/API-Reference/documents/). We can then [create a context object](https://marqo.pages.dev/0.0.21/API-Reference/search/#context) that is used at search time.
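
A rough sketch of that flow, assuming the `expose_facets` flag on `get_document` and the `context` argument on `search` described in the linked docs; the document ID, query and weight are placeholders, and `client`, `index_name_context` and `device` come from the snippet above:

```python
# Retrieve the stored document together with its calculated tensor facets.
doc = client.index(index_name_context).get_document(
    document_id="document1",     # placeholder: use the real _id of the indexed document
    expose_facets=True,
)
context_vector = doc["_tensor_facets"][0]["_embedding"]

# Use the retrieved vector as a context vector at search time.
results = client.index(index_name_context).search(
    "a backpack",                # placeholder query
    context={"tensor": [{"vector": context_vector, "weight": 1.0}]},
    device=device,
)
```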
6 changes: 3 additions & 3 deletions src/marqo/tensor_search/api.py
@@ -172,7 +172,7 @@ def add_or_replace_documents(
request: Request,
body: typing.Union[AddDocsBodyParams, List[Dict]],
index_name: str,
refresh: bool = True,
refresh: bool = False,
marqo_config: config.Config = Depends(generate_config),
non_tensor_fields: Optional[List[str]] = Query(default=None),
device: str = Depends(api_validation.validate_device),
@@ -235,10 +235,10 @@ def delete_index(index_name: str, marqo_config: config.Config = Depends(generate
config=marqo_config, index_name=index_name
)


@app.post("/indexes/{index_name}/documents/delete-batch")
def delete_docs(index_name: str, documentIds: List[str], refresh: bool = True,
def delete_docs(index_name: str, documentIds: List[str], refresh: bool = False,
marqo_config: config.Config = Depends(generate_config)):

return tensor_search.delete_documents(
index_name=index_name, config=marqo_config, doc_ids=documentIds,
auto_refresh=refresh
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/index_meta_cache.py
@@ -61,7 +61,7 @@ def get_cache() -> Dict[str, IndexInfo]:


def refresh_index_info_on_interval(config: Config, index_name: str, interval_seconds: int) -> None:
"""Refreshes an index's index_info if inteval_seconds have elapsed since the last time it was refreshed
"""Refreshes an index's index_info if interval_seconds have elapsed since the last time it was refreshed
Non-thread safe, so there is a chance two threads both refresh index_info at the same time.
"""
6 changes: 5 additions & 1 deletion src/marqo/tensor_search/models/add_docs_objects.py
@@ -41,7 +41,11 @@ class AddDocsParamsConfig:


class AddDocsBodyParams(BaseModel):
"""The parameters of the body parameters of tensor_search_add_documents() function"""
"""
Representation of the body parameters of the API add_or_replace_documents() function.
This will be processed by add_docs_params_orchestrator (along with other query parameters)
into an AddDocsParams object to be given to tensor_search.add_documents()
"""
class Config:
arbitrary_types_allowed = True
allow_mutation = False
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/web/api_utils.py
@@ -136,7 +136,7 @@ def decode_mappings(mappings: Optional[str] = None) -> dict:


def add_docs_params_orchestrator(index_name: str, body: Union[AddDocsBodyParams, List[Dict]],
device: str, auto_refresh: bool = True, non_tensor_fields: Optional[List[str]] = None,
device: str, auto_refresh: bool = False, non_tensor_fields: Optional[List[str]] = None,
mappings: Optional[dict] = dict(), model_auth: Optional[ModelAuth] = None,
image_download_headers: Optional[dict] = dict(),
use_existing_tensors: Optional[bool] = False, query_parameters: Optional[Dict] = dict()) -> AddDocsParams:
106 changes: 105 additions & 1 deletion tests/tensor_search/test_api.py
@@ -6,7 +6,7 @@
from tests.marqo_test import MarqoTestCase


class ApiTests(MarqoTestCase):
class ApiTestsAddDocs(MarqoTestCase):
def setUp(self):
api.OPENSEARCH_URL = 'http://localhost:0000'
self.client = TestClient(api.app)
@@ -90,3 +90,107 @@ def test_add_or_replace_documents_fields_undefined_query_param(self):
)
self.assertEqual(response.status_code, 400)
mock_add_documents.assert_not_called()

def test_add_or_replace_documents_defaults(self):
"""
Ensures that API calls to add or replace documents call tensor_search.add_documents
with the correct defaults (e.g. auto_refresh)
"""
with mock.patch('marqo.tensor_search.tensor_search.add_documents') as mock_add_documents:
response = self.client.post(
"/indexes/index1/documents?device=cpu",
json={
"documents": [
{
"id": "1",
"text": "This is a test document",
}
],
"tensorFields": ['text']
},
)
self.assertEqual(response.status_code, 200)
mock_add_documents.assert_called_once()
args, kwargs = mock_add_documents.call_args

# Assert that add documents is called with the correct default arguments
assert kwargs["add_docs_params"].auto_refresh == False
assert kwargs["add_docs_params"].use_existing_tensors == False

def test_add_or_replace_documents_auto_refresh_true(self):
"""
Ensures that calling add documents with some parameters set to non-default values
(refresh, use_existing_tensors) works as expected.
"""
with mock.patch('marqo.tensor_search.tensor_search.add_documents') as mock_add_documents:
response = self.client.post(
"/indexes/index1/documents?device=cpu&refresh=true",
json={
"documents": [
{
"id": "1",
"text": "This is a test document",
}
],
"useExistingTensors": True,
"tensorFields": ['text']
},
)

self.assertEqual(response.status_code, 200)
mock_add_documents.assert_called_once()
args, kwargs = mock_add_documents.call_args

# Assert that add documents is called with the correct new arguments
assert kwargs["add_docs_params"].auto_refresh == True
assert kwargs["add_docs_params"].use_existing_tensors == True


class ApiTestsDeleteDocs(MarqoTestCase):
def setUp(self):
api.OPENSEARCH_URL = 'http://localhost:0000'
self.client = TestClient(api.app)

def test_delete_docs_defaults(self):
"""
Ensures that API calls to delete documents call tensor_search.delete_documents
with the correct defaults (e.g. auto_refresh)
"""

with mock.patch('marqo.tensor_search.tensor_search.delete_documents') as mock_delete_documents:
response = self.client.post(
"/indexes/index1/documents/delete-batch",
json=['0', '1', '2']
)
"""
TODO: figure out why this format results in an error:
json={
"documentIds": ['0', '1', '2']
}
"""

self.assertEqual(response.status_code, 200)
mock_delete_documents.assert_called_once()
args, kwargs = mock_delete_documents.call_args

# Assert that delete_documents is called with the correct default arguments
assert kwargs["auto_refresh"] == False

def test_delete_docs_auto_refresh_true(self):
"""
Ensures that API calls to delete documents with parameters set (auto_refresh=True)
reflect those in calls to tensor_search.delete_documents
"""

with mock.patch('marqo.tensor_search.tensor_search.delete_documents') as mock_delete_documents:
response = self.client.post(
"/indexes/index1/documents/delete-batch?refresh=true",
json=['0', '1', '2']
)

self.assertEqual(response.status_code, 200)
mock_delete_documents.assert_called_once()
args, kwargs = mock_delete_documents.call_args

# Assert that delete_documents is called with the correct new arguments
assert kwargs["auto_refresh"] == True
25 changes: 25 additions & 0 deletions tests/tensor_search/test_api_utils.py
@@ -113,6 +113,31 @@ def test_add_docs_params_orchestrator(self):
assert result.docs == [{"test": "doc"}]
assert result.image_download_headers == {"header1": "value1"}

def test_add_docs_params_orchestrator_defaults(self):
"""
Ensures that certain defaults are correct when they are not specified in calls to
add_docs_params_orchestrator (e.g. use_existing_tensors, auto_refresh)
"""

# Set up the bare minimum arguments for the function
index_name = "test-index"
body = AddDocsBodyParams(documents=[{"test": "doc"}],
nonTensorFields=["field1"],
imageDownloadHeaders={"header1": "value1"},
modelAuth=ModelAuth(s3=S3Auth(aws_secret_access_key="test", aws_access_key_id="test")),
mappings={"map1": "value1"})
device = "test-device"

# Call the function with the arguments
result = add_docs_params_orchestrator(index_name, body, device)

# Assert that the defaults are correct
assert isinstance(result, AddDocsParams)
assert result.use_existing_tensors == False
assert result.auto_refresh == False



def test_add_docs_params_orchestrator_deprecated_query_parameters(self):
# Set up the arguments for the function
index_name = "test-index"
