From 56945c57e596f9e02e17fb694aaeb16de32354bb Mon Sep 17 00:00:00 2001 From: bendichter Date: Sun, 30 Jul 2023 19:36:33 -0400 Subject: [PATCH 1/4] change chunk default size to 10MB --- src/hdmf/data_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index dfe552e8c..efa55bdd1 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -187,9 +187,8 @@ def __init__(self, **kwargs): Advanced users are offered full control over the shape parameters for the buffer and the chunks; however, the chunk shape must perfectly divide the buffer shape along each axis. - HDF5 also recommends not setting chunk_mb greater than 1 MB for optimal caching speeds. - See https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf - for more details. + HDF5 recommends a chunk size in the range of 2 to 16 MB for optimal cloud performance. + See https://youtu.be/rcS5vt-mKok?t=621 for more details. """ buffer_gb, buffer_shape, chunk_mb, chunk_shape, self.display_progress, self.progress_bar_options = getargs( "buffer_gb", "buffer_shape", "chunk_mb", "chunk_shape", "display_progress", "progress_bar_options", kwargs @@ -198,7 +197,7 @@ def __init__(self, **kwargs): if buffer_gb is None and buffer_shape is None: buffer_gb = 1.0 if chunk_mb is None and chunk_shape is None: - chunk_mb = 1.0 + chunk_mb = 10.0 assert (buffer_gb is not None) != ( buffer_shape is not None ), "Only one of 'buffer_gb' or 'buffer_shape' can be specified!"
From 29dcb5e8890f191382a29a9ebefa96b4036f9699 Mon Sep 17 00:00:00 2001 From: bendichter Date: Sun, 30 Jul 2023 19:38:14 -0400 Subject: [PATCH 2/4] update arg doc --- src/hdmf/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index efa55bdd1..2df001952 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -154,7 +154,7 @@ class GenericDataChunkIterator(AbstractDataChunkIterator): doc=( "If chunk_shape is not specified, it will be inferred as the smallest chunk " "below the chunk_mb threshold.", - "Defaults to 1MB.", + "Defaults to 10MB.", ), default=None, ), From 0c1b5c3c6fd3a757e169f6e58ac02276485c4081 Mon Sep 17 00:00:00 2001 From: bendichter Date: Mon, 31 Jul 2023 15:53:37 -0400 Subject: [PATCH 3/4] fix tests --- .../utils_test/test_core_GenericDataChunkIterator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py index 7df2eac39..39a57d75c 100644 --- a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py +++ b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py @@ -277,7 +277,7 @@ def test_numpy_array_chunk_iterator(self): def test_buffer_shape_option(self): expected_buffer_shape = (1580, 316) - iterator_options = dict(buffer_shape=expected_buffer_shape) + iterator_options = dict(buffer_shape=expected_buffer_shape, chunk_mb=1.0) self.check_first_data_chunk_call( expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in expected_buffer_shape]), iterator_options=iterator_options, @@ -285,9 +285,9 @@ def test_buffer_shape_option(self): self.check_direct_hdf5_write(iterator_options=iterator_options) def test_buffer_gb_option(self): - # buffer is smaller than default chunk; should collapse to chunk shape + # buffer is smaller than chunk; should collapse to chunk shape resulting_buffer_shape = 
(1580, 316) - iterator_options = dict(buffer_gb=0.0005) + iterator_options = dict(buffer_gb=0.0005, chunk_mb=1.0) self.check_first_data_chunk_call( expected_selection=tuple( [ @@ -334,14 +334,14 @@ def test_chunk_mb_option_while_condition(self): """Test to evoke while condition of default shaping method.""" expected_chunk_shape = (2, 79, 79) special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(2, 2000, 2000), dtype="int16") - iterator = self.TestNumpyArrayDataChunkIterator(array=special_array) + iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0) self.assertEqual(iterator.chunk_shape, expected_chunk_shape) def test_chunk_mb_option_while_condition_unit_maxshape_axis(self): """Test to evoke while condition of default shaping method.""" expected_chunk_shape = (1, 79, 79) special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(1, 2000, 2000), dtype="int16") - iterator = self.TestNumpyArrayDataChunkIterator(array=special_array) + iterator = self.TestNumpyArrayDataChunkIterator(array=special_array, chunk_mb=1.0) self.assertEqual(iterator.chunk_shape, expected_chunk_shape) @unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed") From 9e84f535b698e1eaadd3b6c3c3aab545b16b534f Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Mon, 7 Aug 2023 01:44:59 -0700 Subject: [PATCH 4/4] Update h5tools.py --- src/hdmf/backends/hdf5/h5tools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index b331559bf..63d6c955a 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -29,6 +29,8 @@ H5_REF = special_dtype(ref=Reference) H5_REGREF = special_dtype(ref=RegionReference) +RDCC_NBYTES = 32*2**20 # set raw data chunk cache size = 32 MiB + H5PY_3 = h5py.__version__.startswith('3') @@ -745,7 +747,7 @@ def __read_ref(self, h5obj): def open(self): if self.__file is None: open_flag = 
self.__mode - kwargs = dict() + kwargs = dict(rdcc_nbytes=RDCC_NBYTES) if self.comm: kwargs.update(driver='mpio', comm=self.comm)