Skip to content

Commit 1f8fdc2

Browse files
bors[bot] and sanders41 authored
Merge #260
260: Adding method to add documents in batches r=bidoubiwa a=sanders41 Relates to meilisearch/integration-guides#106 Co-authored-by: Paul Sanders <[email protected]>
2 parents 543bd09 + 2d7f95d commit 1f8fdc2

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

meilisearch/index.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,39 @@ def add_documents(self, documents, primary_key=None):
329329
url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}'
330330
return self.http.post(url, documents)
331331

332+
def add_documents_in_batches(self, documents, batch_size=1000, primary_key=None):
    """Add documents to the index, sending them in several batches.

    The documents are split by `_batch` and every resulting slice is
    forwarded to `add_documents` with the same primary key.

    Parameters
    ----------
    documents: list
        List of documents. Each document should be a dictionary.
    batch_size (optional): int
        The maximum number of documents sent in each batch. Default = 1000
    primary_key (optional): string
        The primary-key used in index. Ignored if already set up.

    Returns
    -------
    updates: list[dict]
        One entry per batch, in submission order. Each entry is the value
        returned by `add_documents` and carries the update id used to track the action:
        https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status

    Raises
    ------
    MeiliSearchApiError
        An error containing details about why MeiliSearch can't process your request.
        MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
    """
    return [
        self.add_documents(chunk, primary_key)
        for chunk in self._batch(documents, batch_size)
    ]
364+
332365
def update_documents(self, documents, primary_key=None):
333366
"""Update documents in the index.
334367
@@ -357,6 +390,38 @@ def update_documents(self, documents, primary_key=None):
357390
url = f'{self.config.paths.index}/{self.uid}/{self.config.paths.document}?{primary_key}'
358391
return self.http.put(url, documents)
359392

393+
def update_documents_in_batches(self, documents, batch_size=1000, primary_key=None):
    """Update documents in the index, sending them in several batches.

    The documents are split by `_batch` and every resulting slice is
    forwarded to `update_documents` with the same primary key.

    Parameters
    ----------
    documents: list
        List of documents. Each document should be a dictionary.
    batch_size (optional): int
        The maximum number of documents sent in each batch. Default = 1000
    primary_key (optional): string
        The primary-key used in index. Ignored if already set up.

    Returns
    -------
    updates: list[dict]
        One entry per batch, in submission order. Each entry is the value
        returned by `update_documents` and carries the update id used to track the action:
        https://docs.meilisearch.com/reference/api/updates.html#get-an-update-status

    Raises
    ------
    MeiliSearchApiError
        An error containing details about why MeiliSearch can't process your request.
        MeiliSearch error codes are described here: https://docs.meilisearch.com/errors/#meilisearch-errors
    """
    return [
        self.update_documents(chunk, primary_key)
        for chunk in self._batch(documents, batch_size)
    ]
360425

361426
def delete_document(self, document_id):
362427
"""Delete one document from the index.
@@ -935,5 +1000,11 @@ def reset_attributes_for_faceting(self):
9351000
self.__settings_url_for(self.config.paths.attributes_for_faceting),
9361001
)
9371002

1003+
@staticmethod
def _batch(documents, batch_size):
    """Yield consecutive slices of `documents` holding at most `batch_size` items.

    Parameters
    ----------
    documents: list
        Sequence of documents. It must support `len()` and slicing.
    batch_size: int
        Maximum number of documents per slice. Must be at least 1.

    Yields
    ------
    batch: list
        Slices of `documents` in their original order; the last one may be
        shorter than `batch_size`.

    Raises
    ------
    ValueError
        If `batch_size` is lower than 1. Because this is a generator, the
        error surfaces when iteration starts, not when `_batch` is called.
    """
    # A negative step would make range() empty and silently drop every
    # document; reject it explicitly (zero already raised ValueError in range()).
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    for start in range(0, len(documents), batch_size):
        yield documents[start : start + batch_size]
1008+
9381009
def __settings_url_for(self, sub_route):
9391010
return f'{self.config.paths.index}/{self.uid}/{self.config.paths.setting}/{sub_route}'

meilisearch/tests/index/test_index_document_meilisearch.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# pylint: disable=invalid-name
22

3+
from math import ceil
4+
35
import pytest
46

57
def test_get_documents_default(empty_index):
@@ -18,6 +20,28 @@ def test_add_documents(empty_index, small_movies):
1820
assert index.get_primary_key() == 'id'
1921
assert update['status'] == 'processed'
2022

23+
@pytest.mark.parametrize("batch_size", [2, 3, 1000])
@pytest.mark.parametrize(
    "primary_key, expected_primary_key", [("release_date", "release_date"), (None, "id")]
)
def test_add_documents_in_batches(
    batch_size,
    primary_key,
    expected_primary_key,
    empty_index,
    small_movies,
):
    """Tests adding documents in batches, with and without an explicit primary key."""
    index = empty_index()
    updates = index.add_documents_in_batches(small_movies, batch_size, primary_key)

    # One update is expected per batch, the last batch possibly being partial.
    expected_batches = ceil(len(small_movies) / batch_size)
    assert len(updates) == expected_batches

    for entry in updates:
        assert "updateId" in entry
        status = index.wait_for_pending_update(entry["updateId"])["status"]
        assert status == "processed"

    assert index.get_primary_key() == expected_primary_key
44+
2145
def test_get_document(index_with_documents):
2246
"""Tests getting one document from a populated index."""
2347
response = index_with_documents().get_document('500682')
@@ -66,6 +90,28 @@ def test_update_documents(index_with_documents, small_movies):
6690
response = index.get_documents()
6791
assert response[0]['title'] != 'Some title'
6892

93+
@pytest.mark.parametrize("batch_size", [2, 3, 1000])
@pytest.mark.parametrize(
    "primary_key, expected_primary_key", [("release_date", "release_date"), (None, "id")]
)
def test_update_documents_in_batches(
    batch_size,
    primary_key,
    expected_primary_key,
    empty_index,
    small_movies,
):
    """Tests updating documents in batches, with and without an explicit primary key."""
    index = empty_index()
    updates = index.update_documents_in_batches(small_movies, batch_size, primary_key)

    # One update is expected per batch, the last batch possibly being partial.
    expected_batches = ceil(len(small_movies) / batch_size)
    assert len(updates) == expected_batches

    for entry in updates:
        assert "updateId" in entry
        status = index.wait_for_pending_update(entry["updateId"])["status"]
        assert status == "processed"

    assert index.get_primary_key() == expected_primary_key
114+
69115
def test_delete_document(index_with_documents):
70116
"""Tests deleting a single document."""
71117
index = index_with_documents()

0 commit comments

Comments
 (0)