diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 1ea5e676b9..16212e8b77 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -1,10 +1,6 @@ Storage (``zarr.storage``) ========================== -.. module:: zarr.storage - -This module contains storage classes for use with Zarr arrays and groups. -However, note that any object implementing the ``MutableMapping`` interface -can be used as a Zarr array store. +.. automodule:: zarr.storage .. autofunction:: init_array .. autofunction:: init_group @@ -12,6 +8,7 @@ can be used as a Zarr array store. .. autoclass:: DictStore .. autoclass:: DirectoryStore .. autoclass:: TempStore +.. autoclass:: NestedDirectoryStore .. autoclass:: ZipStore .. automethod:: close diff --git a/tox.ini b/tox.ini index 96bff14383..47ac10965b 100644 --- a/tox.ini +++ b/tox.ini @@ -13,6 +13,7 @@ setenv = py34,py35,py36: PY_MAJOR_VERSION = py3 py27: PY_MAJOR_VERSION = py2 commands = + python -c 'import glob; import shutil; import os; [(shutil.rmtree(d) if os.path.isdir(d) else os.remove(d) if os.path.isfile(d) else None) for d in glob.glob("./example*")]' py27,py34,py35: nosetests -v --with-coverage --cover-erase --cover-package=zarr zarr py36: nosetests -v --with-coverage --cover-erase --cover-package=zarr --with-doctest --doctest-options=+NORMALIZE_WHITESPACE,+ELLIPSIS zarr py36: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst diff --git a/zarr/__init__.py b/zarr/__init__.py index 8637cb02ff..7ec9e8f762 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -4,9 +4,9 @@ from zarr.core import Array -from zarr.creation import empty, zeros, ones, full, array, empty_like, \ - zeros_like, ones_like, full_like, open, open_array, open_like, create -from zarr.storage import DictStore, DirectoryStore, ZipStore, TempStore +from zarr.creation import empty, zeros, ones, full, array, empty_like, zeros_like, ones_like, \ + full_like, open, open_array, open_like, create +from zarr.storage import 
def dir_path(self, path=None):
    """Return the file system directory that corresponds to a storage path.

    Parameters
    ----------
    path : string, optional
        Storage path; it is passed through ``normalize_storage_path`` before use.

    Returns
    -------
    dir_path : string
        ``self.path`` when the normalized storage path is empty, otherwise ``self.path``
        joined with the normalized storage path.

    """
    store_path = normalize_storage_path(path)
    if not store_path:
        # empty storage path refers to the store's own root directory
        return self.path
    return os.path.join(self.path, store_path)
_prog_ckey = re.compile(r'^(\d+)(\.\d+)+$')
_prog_number = re.compile(r'^\d+$')


def _map_ckey(key):
    """Map a store key to the key used on the file system by NestedDirectoryStore.

    If the last '/'-separated segment of `key` looks like a multidimensional chunk key (two or
    more integers joined by '.', e.g. '2.1.12'), the '.' characters in that segment are
    replaced with '/'. Any other key is returned unchanged.

    """
    # rpartition yields ('', '', key) when there is no '/', so single-segment keys work too
    parent, sep, last_segment = key.rpartition('/')
    if _prog_ckey.match(last_segment):
        last_segment = last_segment.replace('.', '/')
    return parent + sep + last_segment


class NestedDirectoryStore(DirectoryStore):
    """Mutable Mapping interface to a directory, with special handling for chunk keys so that
    chunk files for multidimensional arrays are stored in a nested directory tree. Keys must be
    strings, values must be bytes-like objects.

    Parameters
    ----------
    path : string
        Location of directory.

    Examples
    --------
    Most keys are mapped to file paths as normal, e.g.::

        >>> import zarr
        >>> store = zarr.NestedDirectoryStore('example_nested_store')
        >>> store['foo'] = b'bar'
        >>> store['foo']
        b'bar'
        >>> store['a/b/c'] = b'xxx'
        >>> store['a/b/c']
        b'xxx'
        >>> open('example_nested_store/foo', 'rb').read()
        b'bar'
        >>> open('example_nested_store/a/b/c', 'rb').read()
        b'xxx'

    Chunk keys are handled in a special way, such that the '.' characters in the key are mapped to
    directory path separators internally. E.g.::

        >>> store['bar/0.0'] = b'yyy'
        >>> store['bar/0.0']
        b'yyy'
        >>> store['baz/2.1.12'] = b'zzz'
        >>> store['baz/2.1.12']
        b'zzz'
        >>> open('example_nested_store/bar/0/0', 'rb').read()
        b'yyy'
        >>> open('example_nested_store/baz/2/1/12', 'rb').read()
        b'zzz'

    Notes
    -----
    The standard DirectoryStore class stores all chunk files for an array together in a single
    directory. On some file systems the potentially large number of files in a single directory
    can cause performance issues. The NestedDirectoryStore class provides an alternative where
    chunk files for multidimensional arrays will be organised into a directory hierarchy,
    thus reducing the number of files in any one directory.

    """

    def __init__(self, path):
        super(NestedDirectoryStore, self).__init__(path)

    def __getitem__(self, key):
        """Return the value stored under `key`, after mapping chunk keys to nested paths."""
        key = _map_ckey(key)
        return super(NestedDirectoryStore, self).__getitem__(key)

    def __setitem__(self, key, value):
        """Store `value` under `key`, after mapping chunk keys to nested paths."""
        key = _map_ckey(key)
        super(NestedDirectoryStore, self).__setitem__(key, value)

    def __delitem__(self, key):
        """Delete the item stored under `key`, after mapping chunk keys to nested paths."""
        key = _map_ckey(key)
        super(NestedDirectoryStore, self).__delitem__(key)

    def __contains__(self, key):
        """Test for `key`, after mapping chunk keys to nested paths."""
        key = _map_ckey(key)
        return super(NestedDirectoryStore, self).__contains__(key)

    def __eq__(self, other):
        """Stores compare equal when both are NestedDirectoryStore instances on the same path."""
        return (
            isinstance(other, NestedDirectoryStore) and
            self.path == other.path
        )

    def listdir(self, path=None):
        """List the children of `path`, reporting nested chunk files as standard chunk keys.

        Parameters
        ----------
        path : string, optional
            Storage path of the directory to list.

        Returns
        -------
        children : list of strings
            The listing from the parent class, unless it contains the array metadata key. In
            that case every file found beneath an all-digit sub-directory is reported as a
            '.'-separated chunk key (e.g. file '2/1/12' becomes '2.1.12') and the result is
            sorted.

        """
        children = super(NestedDirectoryStore, self).listdir(path=path)
        if array_meta_key not in children:
            return children

        # special handling of directories containing an array to map nested chunk keys back
        # to standard chunk keys
        new_children = []
        root_path = self.dir_path(path)
        for entry in children:
            entry_path = os.path.join(root_path, entry)
            if _prog_number.match(entry) and os.path.isdir(entry_path):
                for dir_path, _, file_names in os.walk(entry_path):
                    for file_name in file_names:
                        file_path = os.path.join(dir_path, file_name)
                        # relpath is robust to a trailing separator on the store path and to
                        # the root prefix re-occurring inside the path, unlike string splitting
                        rel_path = os.path.relpath(file_path, root_path)
                        new_children.append(rel_path.replace(os.path.sep, '.'))
            else:
                new_children.append(entry)
        return sorted(new_children)
class TestNestedDirectoryStore(TestDirectoryStore, unittest.TestCase):
    """Run the inherited DirectoryStore tests against NestedDirectoryStore, plus a check of
    chunk key nesting."""

    def create_store(self):
        """Return a NestedDirectoryStore on a fresh temporary directory; the directory is
        registered with atexit for clean-up via ``atexit_rmtree``."""
        tmp_dir = tempfile.mkdtemp()
        atexit.register(atexit_rmtree, tmp_dir)
        return NestedDirectoryStore(tmp_dir)

    def test_chunk_nesting(self):
        """Values written under a chunk key are readable via both the chunk key and the
        equivalent nested key."""
        store = self.create_store()
        # any path where last segment looks like a chunk key gets special handling
        nested_cases = (
            ('0.0', '0/0', b'xxx'),
            ('foo/10.20.30', 'foo/10/20/30', b'yyy'),
        )
        for chunk_key, nested_key, value in nested_cases:
            store[chunk_key] = value
            eq(value, store[chunk_key])
            eq(value, store[nested_key])
        # a single-integer key also round-trips
        store['42'] = b'zzz'
        eq(b'zzz', store['42'])