CogStack · mart-r · May 22, 2024 · May 13, 2024 · May 13, 2024 · May 14, 2024
diff --git a/medcat/cat.py b/medcat/cat.py
@@ -1526,6 +1526,11 @@ def multiprocessing_batch_docs_size(self,
 
         This method batches the data based on the number of documents as specified by the user.
 
+        NOTE: When providing a generator for `in_data`, the generator is evaluated (`list(in_data)`)
+              and thus all the data is kept in memory and (potentially) duplicated for use in
+              multiple processes. So if you're using a lot of data, it may be better to use
+              `CAT.multiprocessing_batch_char_size` instead.
+
         PS:
         This method supports Windows.
 
@@ -1550,6 +1555,8 @@ def multiprocessing_batch_docs_size(self,
         if nproc == 0:
             raise ValueError("nproc cannot be set to zero")
 
+        # TODO: Surely there's a way to not materialise all of the incoming data in memory?
+        #       Doing so is counterproductive when the point is to allow passing generators.
         in_data = list(in_data) if isinstance(in_data, Iterable) else in_data
         n_process = nproc if nproc is not None else min(max(cpu_count() - 1, 1), math.ceil(len(in_data) / batch_factor))
         batch_size = batch_size if batch_size is not None else math.ceil(len(in_data) / (batch_factor * abs(n_process)))