Clarify documentation of DataChunkIterator (#813)

oruebel · rly · web-flow · commit ad554774cda5 · 2023-01-12T16:05:26.000-08:00
* Fix #623 Clarify documentation of DataChunkIterator * Update CHANGELOG.md Co-authored-by: Ryan Ly <rly@lbl.gov>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@
 - Changed the name of `ExternalResources.export_to_sqlite` to `ExternalResources.to_sqlite`. @mavaylon [#799](https://github.com/hdmf-dev/hdmf/pull/799)
 - Updated the tutorial for `ExternalResources`. @mavaylon [#799](https://github.com/hdmf-dev/hdmf/pull/799)
 - Added `message` argument for assert methods defined by `hdmf.testing.TestCase` to allow developers to include custom error messages with asserts. @oruebel [#812](https://github.com/hdmf-dev/hdmf/pull/812)
+- Clarified the expected chunk shape behavior for `DataChunkIterator`. @oruebel [#813](https://github.com/hdmf-dev/hdmf/pull/813)
 
 ## HDMF 3.4.7 (November 9, 2022)
 
diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py
@@ -426,6 +426,16 @@ class DataChunkIterator(AbstractDataChunkIterator):
     i.e., multiple values from the input iterator can be combined to a single chunk. This is
     useful for buffered I/O operations, e.g., to improve performance by accumulating data
     in memory and writing larger blocks at once.
+
+    .. note::
+
+         DataChunkIterator assumes that the iterator that it wraps returns one element along the
+         iteration dimension at a time. I.e., the iterator is expected to return chunks that are
+         one dimension lower than the array itself. For example, when iterating over the first dimension
+         of a dataset with shape (1000, 10, 10), the iterator would return 1000 chunks of
+         shape (10, 10), one chunk at a time. If this pattern does not match your use-case then
+         using :py:class:`~hdmf.data_utils.GenericDataChunkIterator` or
+         :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more appropriate.
     """
 
     __docval_init = (
@@ -585,10 +595,13 @@ def _read_next_chunk(self):
         return self.__next_chunk
 
     def __next__(self):
-        r"""Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.
+        """
+        Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.
 
-        HINT: numpy.s\_ provides a convenient way to generate index tuples using standard array slicing. This
-        is often useful to define the DataChunk.selection of the current chunk
+        .. tip::
+
+            :py:attr:`numpy.s_` provides a convenient way to generate index tuples using standard array slicing. This
+            is often useful to define the DataChunk.selection of the current chunk.
 
         :returns: DataChunk object with the data and selection of the current chunk
         :rtype: DataChunk
@@ -639,11 +652,19 @@ def recommended_data_shape(self):
     @property
     def maxshape(self):
         """
-        Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator. If an iterator
-        is provided and no data has been read yet, then the first chunk will be read (i.e., next will be called on the
-        iterator) in order to determine the maxshape.
+        Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator.
+
+        .. note::
+
+            If an iterator is provided and no data has been read yet, then the first chunk will be read
+            (i.e., next will be called on the iterator) in order to determine the maxshape. The iterator
+            is expected to return single chunks along the iteration dimension. This means that maxshape will
+            add an additional dimension along the iteration dimension. E.g., if we iterate over
+            the first dimension and the iterator returns chunks of shape (10, 10), then the maxshape would
+            be (None, 10, 10) or (len(self.data), 10, 10), depending on whether the size of the
+            iteration dimension is known.
 
-        :return: Shape tuple. None is used for dimenwions where the maximum shape is not known or unlimited.
+        :return: Shape tuple. None is used for dimensions where the maximum shape is not known or unlimited.
         """
         if self.__maxshape is None:
             # If no data has been read from the iterator yet, read the first chunk and use it to determine the maxshape