diff --git a/meilisearch/index.py b/meilisearch/index.py
index 66d1a806..76f16cc4 100644
--- a/meilisearch/index.py
+++ b/meilisearch/index.py
@@ -329,6 +329,39 @@ def add_documents(self, documents, primary_key=None):
         url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}'
         return self.http.post(url, documents)
 
+    def add_documents_in_batches(self, documents, batch_size=1000, primary_key=None):
+        """Add documents to the index in batches.
+
+        Parameters
+        ----------
+        documents: list
+            List of documents. Each document should be a dictionary.
+        batch_size (optional): int
+            The number of documents that should be included in each batch. Default = 1000
+        primary_key (optional): string
+            The primary-key used in index. Ignored if already set up.
+
+        Returns
+        -------
+        update: list[dict]
+            List of dictionaries containing an update id to track the action:
+            https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status
+
+        Raises
+        ------
+        MeiliSearchApiError
+            An error containing details about why MeiliSearch can't process your request.
+            MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
+        """
+
+        update_ids = []
+
+        for document_batch in self._batch(documents, batch_size):
+            update_id = self.add_documents(document_batch, primary_key)
+            update_ids.append(update_id)
+
+        return update_ids
+
     def update_documents(self, documents, primary_key=None):
         """Update documents in the index.
 
@@ -357,6 +390,38 @@ def update_documents(self, documents, primary_key=None):
         url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}'
         return self.http.put(url, documents)
 
+    def update_documents_in_batches(self, documents, batch_size=1000, primary_key=None):
+        """Update documents in the index in batches.
+
+        Parameters
+        ----------
+        documents: list
+            List of documents. Each document should be a dictionary.
+        batch_size (optional): int
+            The number of documents that should be included in each batch. Default = 1000
+        primary_key (optional): string
+            The primary-key used in index. Ignored if already set up.
+
+        Returns
+        -------
+        update: list[dict]
+            List of dictionaries containing an update id to track the action:
+            https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status
+
+        Raises
+        ------
+        MeiliSearchApiError
+            An error containing details about why MeiliSearch can't process your request.
+            MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
+        """
+
+        update_ids = []
+
+        for document_batch in self._batch(documents, batch_size):
+            update_id = self.update_documents(document_batch, primary_key)
+            update_ids.append(update_id)
+
+        return update_ids
     def delete_document(self, document_id):
         """Delete one document from the index.
 
@@ -935,5 +1000,11 @@ def reset_attributes_for_faceting(self):
             self.__settings_url_for(self.config.paths.attributes_for_faceting),
         )
 
+    @staticmethod
+    def _batch(documents, batch_size):
+        total_len = len(documents)
+        for i in range(0, total_len, batch_size):
+            yield documents[i : i + batch_size]
+
     def __settings_url_for(self, sub_route):
         return f'{self.config.paths.index}/{self.uid}/{self.config.paths.setting}/{sub_route}'
diff --git a/meilisearch/tests/index/test_index_document_meilisearch.py b/meilisearch/tests/index/test_index_document_meilisearch.py
index ea710e73..57b5d4e3 100644
--- a/meilisearch/tests/index/test_index_document_meilisearch.py
+++ b/meilisearch/tests/index/test_index_document_meilisearch.py
@@ -1,5 +1,7 @@
 # pylint: disable=invalid-name
 
+from math import ceil
+
 import pytest
 
 def test_get_documents_default(empty_index):
@@ -18,6 +20,28 @@ def test_add_documents(empty_index, small_movies):
     assert index.get_primary_key() == 'id'
     assert update['status'] == 'processed'
 
+@pytest.mark.parametrize("batch_size", [2, 3, 1000])
+@pytest.mark.parametrize(
+    "primary_key, expected_primary_key", [("release_date", "release_date"), (None, "id")]
+)
+def test_add_documents_in_batches(
+    batch_size,
+    primary_key,
+    expected_primary_key,
+    empty_index,
+    small_movies,
+):
+    index = empty_index()
+    response = index.add_documents_in_batches(small_movies, batch_size, primary_key)
+    assert ceil(len(small_movies) / batch_size) == len(response)
+
+    for r in response:
+        assert "updateId" in r
+        update = index.wait_for_pending_update(r["updateId"])
+        assert update["status"] == "processed"
+
+    assert index.get_primary_key() == expected_primary_key
+
 def test_get_document(index_with_documents):
     """Tests getting one document from a populated index."""
     response = index_with_documents().get_document('500682')
@@ -66,6 +90,28 @@ def test_update_documents(index_with_documents, small_movies):
     response = index.get_documents()
     assert response[0]['title'] != 'Some title'
 
+@pytest.mark.parametrize("batch_size", [2, 3, 1000])
+@pytest.mark.parametrize(
+    "primary_key, expected_primary_key", [("release_date", "release_date"), (None, "id")]
+)
+def test_update_documents_in_batches(
+    batch_size,
+    primary_key,
+    expected_primary_key,
+    empty_index,
+    small_movies,
+):
+    index = empty_index()
+    response = index.update_documents_in_batches(small_movies, batch_size, primary_key)
+    assert ceil(len(small_movies) / batch_size) == len(response)
+
+    for r in response:
+        assert "updateId" in r
+        update = index.wait_for_pending_update(r["updateId"])
+        assert update["status"] == "processed"
+
+    assert index.get_primary_key() == expected_primary_key
+
 def test_delete_document(index_with_documents):
     """Tests deleting a single document."""
     index = index_with_documents()
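
Usage note: a minimal sketch of how the new batch helpers might be called. The server URL, master key, index uid, and document shape below are illustrative assumptions, not part of this change; only add_documents_in_batches, update_documents_in_batches, and wait_for_pending_update come from the library itself.

    import meilisearch

    # Illustrative setup: adjust the URL, key, and index uid for your deployment;
    # create_index is assumed to return an Index instance here.
    client = meilisearch.Client('http://127.0.0.1:7700', 'masterKey')
    index = client.create_index('movies')

    # Hypothetical documents: any list of dicts with a primary key field works.
    documents = [{'id': i, 'title': f'Movie {i}'} for i in range(2500)]

    # 2500 documents with batch_size=1000 -> 3 batches -> a list of 3 update dicts.
    updates = index.add_documents_in_batches(documents, batch_size=1000, primary_key='id')
    for update in updates:
        index.wait_for_pending_update(update['updateId'])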