diff --git a/CHANGELOG.md b/CHANGELOG.md index ec0ec2157..ca9a1fe23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # HDMF Changelog -## HDMF 3.0.0 (Upcoming) +## HDMF 3.0.0 (July 3, 2021) ### New features - Add support for Python 3.9, drop support for Python 3.6. @rly (#620) @@ -15,26 +15,23 @@ `h5py.Dataset` object, and `d[:]` returns `str` objects. Under HDMF 3.x, the same dataset `d` is read as a `hdmf.utils.StrDataset` object and `d[:]` still returns `str` objects. - Add RRID to docs. @oruebel (#633) -- +- Allow passing ``index=True`` to ``DynamicTable.to_dataframe()`` to support returning ``DynamicTableRegion`` columns as indices or Pandas DataFrame. @rly (#579) +- Updated external resources tutorial. @mavaylon (#611) ### Breaking changes and deprecations -- -- -- -- - -### Internal improvements -- -- -- -- +- Previously, when using ``DynamicTable.__getitem__`` or ``DynamicTable.get`` to access a selection of a + ``DynamicTable`` containing a ``DynamicTableRegion``, new columns with mangled names for the table data referred to + by the ``DynamicTableRegion`` were added to the returned DataFrame. This did not work properly for ragged + ``DynamicTableRegion``, multiple levels of nesting, or multiple rows returned. + Now, these methods will by default return columns of indices of the ``DynamicTableRegion``. If ``index=False`` is + passed to ``DynamicTable.get``, then nested DataFrames will be returned, one DataFrame per row of the original + resulting DataFrame. @rly (#579) ### Bug fixes -- Update the validator to allow extensions to data types which only define data_type_inc @dsleiter (#609) -- Fix error when validating lazy-loaded datasets containing references @dsleiter (#609) -- -- -- +- Update the validator to allow extensions to data types which only define data_type_inc. @dsleiter (#609) +- Fix error when validating lazy-loaded datasets containing references.
@dsleiter (#609) +- Fix error when using ``DynamicTable.__getitem__`` or ``DynamicTable.get`` when table has a ragged + ``DynamicTableRegion``. @rly (#579) ## HDMF 2.5.8 (June 16, 2021) diff --git a/requirements-min.txt b/requirements-min.txt index 0497d3af6..1c5d5609e 100644 --- a/requirements-min.txt +++ b/requirements-min.txt @@ -1,5 +1,5 @@ # minimum versions of package dependencies for installing HDMF -h5py==2.9 # support for setting attrs to lists of utf-8 added in 2.9 +h5py==2.10 # support for selection of datasets with list of indices added in 2.10 numpy==1.16 scipy==1.1 pandas==1.0.5 diff --git a/setup.py b/setup.py index d5dbc8255..80619c1c6 100755 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ schema_dir = 'common/hdmf-common-schema/common' reqs = [ - 'h5py>=2.9,<4', + 'h5py>=2.10,<4', 'numpy>=1.16,<1.21', 'scipy>=1.1,<2', 'pandas>=1.0.5,<2', diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index edadc4093..862197cb0 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -9,7 +9,6 @@ import numpy as np import pandas as pd -from h5py import Dataset from . import register_class, EXP_NAMESPACE from ..container import Container, Data @@ -157,7 +156,7 @@ def __getitem_helper(self, arg, **kwargs): def __getitem__(self, arg): """ - Select elements in this VectorIndex and retrieve the corrsponding data from the self.target VectorData + Select elements in this VectorIndex and retrieve the corresponding data from the self.target VectorData :param arg: slice or integer index indicating the elements we want to select in this VectorIndex :return: Scalar or list of values retrieved @@ -791,25 +790,39 @@ def __getitem__(self, key): raise KeyError(key) return ret - def get(self, key, default=None, df=True, **kwargs): # noqa: C901 - """ - Select a subset from the table + def get(self, key, default=None, df=True, index=True, **kwargs): + """Select a subset from the table. 
+ + If the table includes a DynamicTableRegion column, then by default, + the index/indices of the DynamicTableRegion will be returned. If ``df=True`` and ``index=False``, + then the returned pandas DataFrame will contain a nested DataFrame in each row of the + DynamicTableRegion column. If ``df=False`` and ``index=True``, then a list of lists will be returned + where the list containing the DynamicTableRegion column contains the indices of the DynamicTableRegion. + Note that in this case, the DynamicTable referenced by the DynamicTableRegion can be accessed through + the ``table`` attribute of the DynamicTableRegion object. ``df=False`` and ``index=False`` is + not yet supported. :param key: Key defining which elements of the table to select. This may be one of the following: 1) string with the name of the column to select 2) a tuple consisting of (str, int) where the string identifies the column to select by name and the int selects the row - 3) int, list of ints, or slice selecting a set of full rows in the table + 3) int, list of ints, array, or slice selecting a set of full rows in the table. If an int is used, then + scalars are returned for each column that has a single value. If a list, array, or slice is used and + df=False, then lists are returned for each column, even if the list, array, or slice resolves to a + single row. 
:return: 1) If key is a string, then return array with the data of the selected column 2) If key is a tuple of (int, str), then return the scalar value of the selected cell - 3) If key is an int, list, np.ndarray, or slice, then return pandas.DataFrame consisting of one or - more rows + 3) If key is an int, list, np.ndarray, or slice, then return pandas.DataFrame or lists + consisting of one or more rows :raises: KeyError """ ret = None + if not df and not index: + # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported + raise ValueError('DynamicTable.get() with df=False and index=False is not yet supported.') if isinstance(key, tuple): # index by row and column --> return specific cell arg1 = key[0] @@ -828,110 +841,143 @@ def get(self, key, default=None, df=True, **kwargs): # noqa: C901 else: return default else: - # index by int, list, np.ndarray, or slice --> return pandas Dataframe consisting of one or more rows - arg = key - ret = OrderedDict() - try: - # index with a python slice or single int to select one or multiple rows - if not (np.issubdtype(type(arg), np.integer) or isinstance(arg, (slice, list, np.ndarray))): - raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg))) - if isinstance(arg, np.ndarray) and len(arg.shape) != 1: - raise ValueError("cannot index DynamicTable with multiple dimensions") - ret['id'] = self.id[arg] - for name in self.colnames: - col = self.__df_cols[self.__colids[name]] - ret[name] = col.get(arg, df=df, **kwargs) - except ValueError as ve: - # in h5py <2, this was a ValueError. in h5py 3+, this became an IndexError - x = re.match(r"^Index \((.*)\) out of range \(.*\)$", str(ve)) - if x: - msg = ("Row index %s out of range for %s '%s' (length %d)." 
- % (x.groups()[0], self.__class__.__name__, self.name, len(self))) - raise IndexError(msg) - else: # pragma: no cover - raise ve - except IndexError as ie: - x = re.match(r"^Index \((.*)\) out of range for \(.*\)$", str(ie)) - if x: - msg = ("Row index %s out of range for %s '%s' (length %d)." - % (x.groups()[0], self.__class__.__name__, self.name, len(self))) - raise IndexError(msg) - elif str(ie) == 'list index out of range': - msg = ("Row index out of range for %s '%s' (length %d)." - % (self.__class__.__name__, self.name, len(self))) - raise IndexError(msg) - else: # pragma: no cover - raise ie + # index by int, list, np.ndarray, or slice --> + # return pandas Dataframe or lists consisting of one or more rows + sel = self.__get_selection_as_dict(key, df, index, **kwargs) if df: # reformat objects to fit into a pandas DataFrame - id_index = ret.pop('id') - if np.isscalar(id_index): - id_index = [id_index] - retdf = OrderedDict() - for k in ret: # for each column - if isinstance(ret[k], np.ndarray): - if ret[k].ndim == 1: - if len(id_index) == 1: - # k is a multi-dimension column, and - # only one element has been selected - retdf[k] = [ret[k]] - else: - retdf[k] = ret[k] - else: - if len(id_index) == ret[k].shape[0]: - # k is a multi-dimension column, and - # more than one element has been selected - retdf[k] = list(ret[k]) - else: - raise ValueError('unable to convert selection to DataFrame') - elif isinstance(ret[k], (list, tuple)): - if len(id_index) == 1: - # k is a multi-dimension column, and - # only one element has been selected - retdf[k] = [ret[k]] - else: - retdf[k] = ret[k] - elif isinstance(ret[k], pd.DataFrame): - retdf['%s_%s' % (k, ret[k].index.name)] = ret[k].index.values - for col in ret[k].columns: - newcolname = "%s_%s" % (k, col) - retdf[newcolname] = ret[k][col].values - else: - retdf[k] = ret[k] - ret = pd.DataFrame(retdf, index=pd.Index(name=self.id.name, data=id_index)) - # if isinstance(key, (int, np.integer)): - # ret = ret.iloc[0] + 
if np.isscalar(key): + ret = self.__get_selection_as_df_single_row(sel) + else: + ret = self.__get_selection_as_df(sel) else: - ret = list(ret.values()) + ret = list(sel.values()) return ret + def __get_selection_as_dict(self, arg, df, index, exclude=None, **kwargs): + """Return a dict mapping column names to values (lists/arrays or dataframes) for the given selection. + Uses each column's get() method, passing kwargs as necessary. + + :param arg: key passed to get() to return one or more rows + :type arg: int, list, np.ndarray, or slice + """ + if not (np.issubdtype(type(arg), np.integer) or isinstance(arg, (slice, list, np.ndarray))): + raise KeyError("Key type not supported by DynamicTable %s" % str(type(arg))) + if isinstance(arg, np.ndarray) and arg.ndim != 1: + raise ValueError("Cannot index DynamicTable with multiple dimensions") + if exclude is None: + exclude = set([]) + ret = OrderedDict() + try: + # index with a python slice or single int to select one or multiple rows + ret['id'] = self.id[arg] + for name in self.colnames: + if name in exclude: + continue + col = self.__df_cols[self.__colids[name]] + if index and (isinstance(col, DynamicTableRegion) or + (isinstance(col, VectorIndex) and isinstance(col.target, DynamicTableRegion))): + # return indices (in list, array, etc.) for DTR and ragged DTR + ret[name] = col.get(arg, df=False, index=True, **kwargs) + else: + ret[name] = col.get(arg, df=df, index=index, **kwargs) + return ret + # if index is out of range, different errors can be generated depending on the dtype of the column + # but despite the differences, raise an IndexError from that error + except ValueError as ve: + # in h5py <2, if the column is an h5py.Dataset, a ValueError was raised + # in h5py 3+, this became an IndexError + x = re.match(r"^Index \((.*)\) out of range \(.*\)$", str(ve)) + if x: + msg = ("Row index %s out of range for %s '%s' (length %d)." 
+ % (x.groups()[0], self.__class__.__name__, self.name, len(self))) + raise IndexError(msg) from ve + else: # pragma: no cover + raise ve + except IndexError as ie: + x = re.match(r"^Index \((.*)\) out of range for \(.*\)$", str(ie)) + if x: + msg = ("Row index %s out of range for %s '%s' (length %d)." + % (x.groups()[0], self.__class__.__name__, self.name, len(self))) + raise IndexError(msg) + elif str(ie) == 'list index out of range': + msg = ("Row index out of range for %s '%s' (length %d)." + % (self.__class__.__name__, self.name, len(self))) + raise IndexError(msg) from ie + else: # pragma: no cover + raise ie + + def __get_selection_as_df_single_row(self, coldata): + """Return a pandas dataframe for the given row and columns with the id column as the index. + + This is a special case of __get_selection_as_df where a single row was requested. + + :param coldata: dict mapping column names to values (list/arrays or dataframes) + :type coldata: dict + """ + id_index_orig = coldata.pop('id') + id_index = [id_index_orig] + df_input = OrderedDict() + for k in coldata: # for each column + if isinstance(coldata[k], (np.ndarray, list, tuple, pd.DataFrame)): + # wrap in a list because coldata[k] may be an array/list/tuple with multiple elements (ragged or + # multi-dim column) and pandas needs to have one element per index row (=1 in this case) + df_input[k] = [coldata[k]] + else: # scalar, don't wrap + df_input[k] = coldata[k] + ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index)) + return ret + + def __get_selection_as_df(self, coldata): + """Return a pandas dataframe for the given rows and columns with the id column as the index. + + This is used when multiple row indices are selected (or a list/array/slice of a single index is passed to get). + __get_selection_as_df_single_row should be used if a single index is passed to get. 
+ + :param coldata: dict mapping column names to values (list/arrays or dataframes) + :type coldata: dict + """ + id_index = coldata.pop('id') + df_input = OrderedDict() + for k in coldata: # for each column + if isinstance(coldata[k], np.ndarray) and coldata[k].ndim > 1: + df_input[k] = list(coldata[k]) # convert multi-dim array to list of inner arrays + elif isinstance(coldata[k], pd.DataFrame): + # multiple rows were selected and collapsed into a dataframe + # split up the rows of the df into a list of dataframes, one per row + # TODO make this more efficient + df_input[k] = [coldata[k].iloc[[i]] for i in range(len(coldata[k]))] + else: + df_input[k] = coldata[k] + ret = pd.DataFrame(df_input, index=pd.Index(name=self.id.name, data=id_index)) + return ret + def __contains__(self, val): """ Check if the given value (i.e., column) exists in this table """ return val in self.__colids or val in self.__indices - @docval({'name': 'exclude', 'type': set, 'doc': ' Set of columns to exclude from the dataframe', 'default': None}) + @docval({'name': 'exclude', 'type': set, 'doc': 'Set of column names to exclude from the dataframe', + 'default': None}, + {'name': 'index', 'type': bool, + 'doc': ('Whether to return indices for a DynamicTableRegion column. If False, nested dataframes will be ' + 'returned.'), + 'default': False} + ) def to_dataframe(self, **kwargs): """ Produce a pandas DataFrame containing this table's data. 
- """ - exclude = popargs('exclude', kwargs) - if exclude is None: - exclude = set([]) - data = OrderedDict() - for name in self.colnames: - if name in exclude: - continue - col = self.__df_cols[self.__colids[name]] - - if isinstance(col.data, (Dataset, np.ndarray)) and col.data.ndim > 1: - data[name] = [x for x in col[:]] - else: - data[name] = col[:] - return pd.DataFrame(data, index=pd.Index(name=self.id.name, data=self.id.data)) + If this table contains a DynamicTableRegion, by default, nested dataframes will be returned for that column. + + If exclude is None, this is equivalent to table.get(slice(None, None, None), index=False). + """ + arg = slice(None, None, None) # select all rows + sel = self.__get_selection_as_dict(arg, df=True, **kwargs) + ret = self.__get_selection_as_df(sel) + return ret @classmethod @docval( @@ -1083,11 +1129,15 @@ def get(self, arg, index=False, df=True, **kwargs): :param arg: 1) tuple consisting of (str, int) where the string defines the column to select and the int selects the row, 2) int or slice to select a subset of rows - :param df: Boolean indicating whether we want to return the result as a pandas dataframe + :param index: Boolean indicating whether to return indices of the DTR (default False) + :param df: Boolean indicating whether to return the result as a pandas DataFrame (default True) - :return: Result from self.table[....] with the appropritate selection based on the + :return: Result from self.table[....] with the appropriate selection based on the rows selected by this DynamicTableRegion """ + if not df and not index: + # returning nested lists of lists for DTRs and ragged DTRs is complicated and not yet supported + raise ValueError('DynamicTableRegion.get() with df=False and index=False is not yet supported.') # treat the list of indices as data that can be indexed.
then pass the # result to the table to get the data if isinstance(arg, tuple): @@ -1099,13 +1149,13 @@ def get(self, arg, index=False, df=True, **kwargs): raise IndexError('index {} out of bounds for data of length {}'.format(arg, len(self.data))) ret = self.data[arg] if not index: - ret = self.table.get(ret, df=df, **kwargs) + ret = self.table.get(ret, df=df, index=index, **kwargs) return ret elif isinstance(arg, (list, slice, np.ndarray)): idx = arg # get the data at the specified indices - if isinstance(self.data, (tuple, list)) and isinstance(idx, list): + if isinstance(self.data, (tuple, list)) and isinstance(idx, (list, np.ndarray)): ret = [self.data[i] for i in idx] else: ret = self.data[idx] @@ -1122,7 +1172,7 @@ def get(self, arg, index=False, df=True, **kwargs): # of the list we are returning. This is carried out by the recursive method _index_lol uniq = np.unique(ret) lut = {val: i for i, val in enumerate(uniq)} - values = self.table.get(uniq, df=df, **kwargs) + values = self.table.get(uniq, df=df, index=index, **kwargs) if df: ret = values.iloc[[lut[i] for i in ret]] else: @@ -1142,8 +1192,10 @@ def _index_lol(self, result, index, lut): for col in result: if isinstance(col, list): if isinstance(col[0], list): + # list of columns that need to be sorted ret.append(self._index_lol(col, index, lut)) else: + # list of elements, one for each row to return ret.append([col[lut[i]] for i in index]) elif isinstance(col, np.ndarray): ret.append(np.array([col[lut[i]] for i in index], dtype=col.dtype)) diff --git a/src/hdmf/testing/testcase.py b/src/hdmf/testing/testcase.py index c22be7132..a20e1e119 100644 --- a/src/hdmf/testing/testcase.py +++ b/src/hdmf/testing/testcase.py @@ -113,7 +113,10 @@ def _assert_array_equal(self, arr1, arr2, ignore_hdmf_attrs=False): if isinstance(arr2, np.ndarray) and len(arr2.dtype) > 1: # compound type arr2 = arr2.tolist() if isinstance(arr1, np.ndarray) and isinstance(arr2, np.ndarray): - np.testing.assert_allclose(arr1, arr2) + if 
np.issubdtype(arr1.dtype, np.number): + np.testing.assert_allclose(arr1, arr2) + else: + np.testing.assert_array_equal(arr1, arr2) else: for sub1, sub2 in zip(arr1, arr2): if isinstance(sub1, Container): diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 24d71eb96..7ecc4d1de 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -1,15 +1,19 @@ from collections import OrderedDict import h5py import numpy as np +import os import pandas as pd import unittest - from hdmf import Container from hdmf.backends.hdf5 import H5DataIO, HDF5IO +from hdmf.backends.hdf5.h5tools import H5_TEXT, H5PY_3 from hdmf.common import (DynamicTable, VectorData, VectorIndex, ElementIdentifiers, EnumData, DynamicTableRegion, get_manager, SimpleMultiContainer) from hdmf.testing import TestCase, H5RoundTripMixin, remove_test_file +from hdmf.utils import StrDataset + +from tests.unit.utils import get_temp_filepath class TestDynamicTable(TestCase): @@ -565,6 +569,12 @@ def test_index_out_of_bounds(self): with self.assertRaisesWith(IndexError, msg): table[5] + def test_no_df_nested(self): + table = self.with_columns_and_data() + msg = 'DynamicTable.get() with df=False and index=False is not yet supported.' + with self.assertRaisesWith(ValueError, msg): + table.get(0, df=False, index=False) + def test_multidim_col(self): multidim_data = [ [[1, 2], [3, 4], [5, 6]], @@ -827,6 +837,13 @@ def test_repr(self): expected = expected % (id(dynamic_table_region), id(table)) self.assertEqual(str(dynamic_table_region), expected) + def test_no_df_nested(self): + table = self.with_columns_and_data() + dynamic_table_region = DynamicTableRegion('dtr', [0, 1, 2, 2], 'desc', table=table) + msg = 'DynamicTableRegion.get() with df=False and index=False is not yet supported.' 
+ with self.assertRaisesWith(ValueError, msg): + dynamic_table_region.get(0, df=False, index=False) + class DynamicTableRegionRoundTrip(H5RoundTripMixin, TestCase): @@ -880,6 +897,11 @@ def _get(self, arg): table = mc.containers['table_with_dtr'] return table.get(arg) + def _get_nested(self, arg): + mc = self.roundtripContainer() + table = mc.containers['table_with_dtr'] + return table.get(arg, index=False) + def _get_nodf(self, arg): mc = self.roundtripContainer() table = mc.containers['table_with_dtr'] @@ -900,24 +922,58 @@ def test_getitem_badcol(self): self._getitem('boo') def _assert_two_elem_df(self, rec): - columns = ['foo', 'bar', 'baz', 'dtr_id', 'dtr_qux', 'dtr_quz'] - data = [[1, 10.0, 'cat', 0, 'qux_1', 'quz_1'], - [2, 20.0, 'dog', 1, 'qux_2', 'quz_2']] + columns = ['foo', 'bar', 'baz', 'dtr'] + data = [[1, 10.0, 'cat', 0], + [2, 20.0, 'dog', 1]] exp = pd.DataFrame(data=data, columns=columns, index=pd.Series(name='id', data=[0, 1])) pd.testing.assert_frame_equal(rec, exp, check_dtype=False) def _assert_one_elem_df(self, rec): - columns = ['foo', 'bar', 'baz', 'dtr_id', 'dtr_qux', 'dtr_quz'] - data = [[1, 10.0, 'cat', 0, 'qux_1', 'quz_1']] + columns = ['foo', 'bar', 'baz', 'dtr'] + data = [[1, 10.0, 'cat', 0]] exp = pd.DataFrame(data=data, columns=columns, index=pd.Series(name='id', data=[0])) pd.testing.assert_frame_equal(rec, exp, check_dtype=False) + def _assert_two_elem_df_nested(self, rec): + nested_columns = ['qux', 'quz'] + nested_data = [['qux_1', 'quz_1'], ['qux_2', 'quz_2']] + nested_df = pd.DataFrame(data=nested_data, columns=nested_columns, index=pd.Series(name='id', data=[0, 1])) + + columns = ['foo', 'bar', 'baz'] + data = [[1, 10.0, 'cat'], + [2, 20.0, 'dog']] + exp = pd.DataFrame(data=data, columns=columns, index=pd.Series(name='id', data=[0, 1])) + + # remove nested dataframe and test each df separately + pd.testing.assert_frame_equal(rec['dtr'][0], nested_df.iloc[[0]]) + pd.testing.assert_frame_equal(rec['dtr'][1], nested_df.iloc[[1]]) 
+ del rec['dtr'] + pd.testing.assert_frame_equal(rec, exp, check_dtype=False) + + def _assert_one_elem_df_nested(self, rec): + nested_columns = ['qux', 'quz'] + nested_data = [['qux_1', 'quz_1'], ['qux_2', 'quz_2']] + nested_df = pd.DataFrame(data=nested_data, columns=nested_columns, index=pd.Series(name='id', data=[0, 1])) + + columns = ['foo', 'bar', 'baz'] + data = [[1, 10.0, 'cat']] + exp = pd.DataFrame(data=data, columns=columns, index=pd.Series(name='id', data=[0])) + + # remove nested dataframe and test each df separately + pd.testing.assert_frame_equal(rec['dtr'][0], nested_df.iloc[[0]]) + del rec['dtr'] + pd.testing.assert_frame_equal(rec, exp, check_dtype=False) + ##################### # tests DynamicTableRegion.__getitem__ def test_getitem_int(self): rec = self._getitem(0) self._assert_one_elem_df(rec) + def test_getitem_list_single(self): + rec = self._getitem([0]) + self._assert_one_elem_df(rec) + def test_getitem_list(self): rec = self._getitem([0, 1]) self._assert_two_elem_df(rec) @@ -932,6 +988,10 @@ def test_get_int(self): rec = self._get(0) self._assert_one_elem_df(rec) + def test_get_list_single(self): + rec = self._get([0]) + self._assert_one_elem_df(rec) + def test_get_list(self): rec = self._get([0, 1]) self._assert_two_elem_df(rec) @@ -940,11 +1000,29 @@ def test_get_slice(self): rec = self._get(slice(0, 2, None)) self._assert_two_elem_df(rec) + ##################### + # tests DynamicTableRegion.get, return a DataFrame with nested DataFrame + def test_get_nested_int(self): + rec = self._get_nested(0) + self._assert_one_elem_df_nested(rec) + + def test_get_nested_list_single(self): + rec = self._get_nested([0]) + self._assert_one_elem_df_nested(rec) + + def test_get_nested_list(self): + rec = self._get_nested([0, 1]) + self._assert_two_elem_df_nested(rec) + + def test_get_nested_slice(self): + rec = self._get_nested(slice(0, 2, None)) + self._assert_two_elem_df_nested(rec) + ##################### # tests DynamicTableRegion.get, DO NOT return a 
DataFrame def test_get_nodf_int(self): rec = self._get_nodf(0) - exp = [0, 1, 10.0, 'cat', [0, 'qux_1', 'quz_1']] + exp = [0, 1, 10.0, 'cat', 0] self.assertListEqual(rec, exp) def _assert_list_of_ndarray_equal(self, l1, l2): @@ -958,16 +1036,19 @@ def _assert_list_of_ndarray_equal(self, l1, l2): else: np.testing.assert_array_equal(a1, a2) + def test_get_nodf_list_single(self): + rec = self._get_nodf([0]) + exp = [np.array([0]), np.array([1]), np.array([10.0]), np.array(['cat']), np.array([0])] + self._assert_list_of_ndarray_equal(exp, rec) + def test_get_nodf_list(self): rec = self._get_nodf([0, 1]) - exp = [np.array([0, 1]), np.array([1, 2]), np.array([10.0, 20.0]), np.array(['cat', 'dog']), - [np.array([0, 1]), np.array(['qux_1', 'qux_2']), np.array(['quz_1', 'quz_2'])]] + exp = [np.array([0, 1]), np.array([1, 2]), np.array([10.0, 20.0]), np.array(['cat', 'dog']), np.array([0, 1])] self._assert_list_of_ndarray_equal(exp, rec) def test_get_nodf_slice(self): rec = self._get_nodf(slice(0, 2, None)) - exp = [np.array([0, 1]), np.array([1, 2]), np.array([10.0, 20.0]), np.array(['cat', 'dog']), - [np.array([0, 1]), np.array(['qux_1', 'qux_2']), np.array(['quz_1', 'quz_2'])]] + exp = [np.array([0, 1]), np.array([1, 2]), np.array([10.0, 20.0]), np.array(['cat', 'dog']), np.array([0, 1])] self._assert_list_of_ndarray_equal(exp, rec) @@ -1366,74 +1447,420 @@ def test_add_2d_row_index(self): np.testing.assert_array_equal(idx[2], [['c', 'c'], ['c', 'c'], ['c', 'c'], ['c', 'c']]) -class TestIndexing(TestCase): +class SelectionTestMixin: def setUp(self): - dt = DynamicTable(name='slice_test_table', description='a table to test slicing', - id=[0, 1, 2]) - dt.add_column('foo', 'scalar column', data=np.array([0.0, 1.0, 2.0])) - dt.add_column('bar', 'ragged column', index=np.array([2, 3, 6]), - data=np.array(['r11', 'r12', 'r21', 'r31', 'r32', 'r33'])) - dt.add_column('baz', 'multi-dimension column', - data=np.array([[10.0, 11.0, 12.0], - [20.0, 21.0, 22.0], - [30.0, 31.0, 
32.0]])) - self.table = dt + # table1 contains a non-ragged DTR and a ragged DTR, both of which point to table2 + # table2 contains a non-ragged DTR and a ragged DTR, both of which point to table3 + self.table3 = DynamicTable( + name='table3', + description='a test table', + id=[20, 21, 22] + ) + self.table3.add_column('foo', 'scalar column', data=self._wrap([20.0, 21.0, 22.0])) + self.table3.add_column('bar', 'ragged column', index=self._wrap([2, 3, 6]), + data=self._wrap(['t11', 't12', 't21', 't31', 't32', 't33'])) + self.table3.add_column('baz', 'multi-dimension column', + data=self._wrap([[210.0, 211.0, 212.0], + [220.0, 221.0, 222.0], + [230.0, 231.0, 232.0]])) + # generate expected dataframe for table3 + data = OrderedDict() + data['foo'] = [20.0, 21.0, 22.0] + data['bar'] = [['t11', 't12'], ['t21'], ['t31', 't32', 't33']] + data['baz'] = [[210.0, 211.0, 212.0], [220.0, 221.0, 222.0], [230.0, 231.0, 232.0]] + idx = [20, 21, 22] + self.table3_df = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) - def test_single_item(self): - elem = self.table[0] + self.table2 = DynamicTable( + name='table2', + description='a test table', + id=[10, 11, 12] + ) + self.table2.add_column('foo', 'scalar column', data=self._wrap([10.0, 11.0, 12.0])) + self.table2.add_column('bar', 'ragged column', index=self._wrap([2, 3, 6]), + data=self._wrap(['s11', 's12', 's21', 's31', 's32', 's33'])) + self.table2.add_column('baz', 'multi-dimension column', + data=self._wrap([[110.0, 111.0, 112.0], + [120.0, 121.0, 122.0], + [130.0, 131.0, 132.0]])) + self.table2.add_column('qux', 'DTR column', table=self.table3, data=self._wrap([0, 1, 0])) + self.table2.add_column('corge', 'ragged DTR column', index=self._wrap([2, 3, 6]), table=self.table3, + data=self._wrap([0, 1, 2, 0, 1, 2])) + # TODO test when ragged DTR indices are not in ascending order + + # generate expected dataframe for table2 *without DTR* + data = OrderedDict() + data['foo'] = [10.0, 11.0, 12.0] + data['bar'] = 
[['s11', 's12'], ['s21'], ['s31', 's32', 's33']] + data['baz'] = [[110.0, 111.0, 112.0], [120.0, 121.0, 122.0], [130.0, 131.0, 132.0]] + idx = [10, 11, 12] + self.table2_df = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) + + self.table1 = DynamicTable( + name='table1', + description='a table to test slicing', + id=[0, 1] + ) + self.table1.add_column('foo', 'scalar column', data=self._wrap([0.0, 1.0])) + self.table1.add_column('bar', 'ragged column', index=self._wrap([2, 3]), + data=self._wrap(['r11', 'r12', 'r21'])) + self.table1.add_column('baz', 'multi-dimension column', + data=self._wrap([[10.0, 11.0, 12.0], + [20.0, 21.0, 22.0]])) + self.table1.add_column('qux', 'DTR column', table=self.table2, data=self._wrap([0, 1])) + self.table1.add_column('corge', 'ragged DTR column', index=self._wrap([2, 3]), table=self.table2, + data=self._wrap([0, 1, 2])) + self.table1.add_column('barz', 'ragged column of tuples (cpd type)', index=self._wrap([2, 3]), + data=self._wrap([(1.0, 11), (2.0, 12), (3.0, 21)])) + + # generate expected dataframe for table1 *without DTR* + data = OrderedDict() + data['foo'] = self._wrap_check([0.0, 1.0]) + data['bar'] = [self._wrap_check(['r11', 'r12']), self._wrap_check(['r21'])] + data['baz'] = [self._wrap_check([10.0, 11.0, 12.0]), + self._wrap_check([20.0, 21.0, 22.0])] + data['barz'] = [self._wrap_check([(1.0, 11), (2.0, 12)]), self._wrap_check([(3.0, 21)])] + idx = [0, 1] + self.table1_df = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) + + def _check_two_rows_df(self, rec): + data = OrderedDict() + data['foo'] = self._wrap_check([0.0, 1.0]) + data['bar'] = [self._wrap_check(['r11', 'r12']), self._wrap_check(['r21'])] + data['baz'] = [self._wrap_check([10.0, 11.0, 12.0]), + self._wrap_check([20.0, 21.0, 22.0])] + data['qux'] = self._wrap_check([0, 1]) + data['corge'] = [self._wrap_check([0, 1]), self._wrap_check([2])] + data['barz'] = [self._wrap_check([(1.0, 11), (2.0, 12)]), self._wrap_check([(3.0, 21)])] + 
idx = [0, 1] + exp = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) + pd.testing.assert_frame_equal(rec, exp) + + def _check_two_rows_df_nested(self, rec): + # first level: cache nested df cols and remove them before calling pd.testing.assert_frame_equal + qux_series = rec['qux'] + corge_series = rec['corge'] + del rec['qux'] + del rec['corge'] + + idx = [0, 1] + pd.testing.assert_frame_equal(rec, self.table1_df.loc[idx]) + + # second level: compare the nested columns separately + self.assertEqual(len(qux_series), 2) + rec_qux1 = qux_series[0] + rec_qux2 = qux_series[1] + self._check_table2_first_row_qux(rec_qux1) + self._check_table2_second_row_qux(rec_qux2) + + self.assertEqual(len(corge_series), 2) + rec_corge1 = corge_series[0] + rec_corge2 = corge_series[1] + self._check_table2_first_row_corge(rec_corge1) + self._check_table2_second_row_corge(rec_corge2) + + def _check_one_row_df(self, rec): data = OrderedDict() - data['foo'] = 0.0 - data['bar'] = [np.array(['r11', 'r12'])] - data['baz'] = [np.array([10.0, 11.0, 12.0])] + data['foo'] = self._wrap_check([0.0]) + data['bar'] = [self._wrap_check(['r11', 'r12'])] + data['baz'] = [self._wrap_check([10.0, 11.0, 12.0])] + data['qux'] = self._wrap_check([0]) + data['corge'] = [self._wrap_check([0, 1])] + data['barz'] = [self._wrap_check([(1.0, 11), (2.0, 12)])] idx = [0] exp = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) - pd.testing.assert_frame_equal(elem, exp) + pd.testing.assert_frame_equal(rec, exp) + + def _check_one_row_df_nested(self, rec): + # first level: cache nested df cols and remove them before calling pd.testing.assert_frame_equal + qux_series = rec['qux'] + corge_series = rec['corge'] + del rec['qux'] + del rec['corge'] + + idx = [0] + pd.testing.assert_frame_equal(rec, self.table1_df.loc[idx]) + + # second level: compare the nested columns separately + self.assertEqual(len(qux_series), 1) + rec_qux = qux_series[0] + self._check_table2_first_row_qux(rec_qux) + + 
self.assertEqual(len(corge_series), 1) + rec_corge = corge_series[0] + self._check_table2_first_row_corge(rec_corge) + + def _check_table2_first_row_qux(self, rec_qux): + # second level: cache nested df cols and remove them before calling pd.testing.assert_frame_equal + qux_qux_series = rec_qux['qux'] + qux_corge_series = rec_qux['corge'] + del rec_qux['qux'] + del rec_qux['corge'] + + qux_idx = [10] + pd.testing.assert_frame_equal(rec_qux, self.table2_df.loc[qux_idx]) + + # third level: compare the nested columns separately + self.assertEqual(len(qux_qux_series), 1) + pd.testing.assert_frame_equal(qux_qux_series[qux_idx[0]], self.table3_df.iloc[[0]]) + self.assertEqual(len(qux_corge_series), 1) + pd.testing.assert_frame_equal(qux_corge_series[qux_idx[0]], self.table3_df.iloc[[0, 1]]) + + def _check_table2_second_row_qux(self, rec_qux): + # second level: cache nested df cols and remove them before calling pd.testing.assert_frame_equal + qux_qux_series = rec_qux['qux'] + qux_corge_series = rec_qux['corge'] + del rec_qux['qux'] + del rec_qux['corge'] + + qux_idx = [11] + pd.testing.assert_frame_equal(rec_qux, self.table2_df.loc[qux_idx]) + + # third level: compare the nested columns separately + self.assertEqual(len(qux_qux_series), 1) + pd.testing.assert_frame_equal(qux_qux_series[qux_idx[0]], self.table3_df.iloc[[1]]) + self.assertEqual(len(qux_corge_series), 1) + pd.testing.assert_frame_equal(qux_corge_series[qux_idx[0]], self.table3_df.iloc[[2]]) + + def _check_table2_first_row_corge(self, rec_corge): + # second level: cache nested df cols and remove them before calling pd.testing.assert_frame_equal + corge_qux_series = rec_corge['qux'] + corge_corge_series = rec_corge['corge'] + del rec_corge['qux'] + del rec_corge['corge'] + + corge_idx = [10, 11] + pd.testing.assert_frame_equal(rec_corge, self.table2_df.loc[corge_idx]) + + # third level: compare the nested columns separately + self.assertEqual(len(corge_qux_series), 2) + 
pd.testing.assert_frame_equal(corge_qux_series[corge_idx[0]], self.table3_df.iloc[[0]]) + pd.testing.assert_frame_equal(corge_qux_series[corge_idx[1]], self.table3_df.iloc[[1]]) + self.assertEqual(len(corge_corge_series), 2) + pd.testing.assert_frame_equal(corge_corge_series[corge_idx[0]], self.table3_df.iloc[[0, 1]]) + pd.testing.assert_frame_equal(corge_corge_series[corge_idx[1]], self.table3_df.iloc[[2]]) + + def _check_table2_second_row_corge(self, rec_corge): + # second level: cache nested df cols and remove them before calling pd.testing.assert_frame_equal + corge_qux_series = rec_corge['qux'] + corge_corge_series = rec_corge['corge'] + del rec_corge['qux'] + del rec_corge['corge'] + + corge_idx = [12] + pd.testing.assert_frame_equal(rec_corge, self.table2_df.loc[corge_idx]) + + # third level: compare the nested columns separately + self.assertEqual(len(corge_qux_series), 1) + pd.testing.assert_frame_equal(corge_qux_series[corge_idx[0]], self.table3_df.iloc[[0]]) + self.assertEqual(len(corge_corge_series), 1) + pd.testing.assert_frame_equal(corge_corge_series[corge_idx[0]], self.table3_df.iloc[[0, 1, 2]]) + + def _check_two_rows_no_df(self, rec): + self.assertEqual(rec[0], [0, 1]) + np.testing.assert_array_equal(rec[1], self._wrap_check([0.0, 1.0])) + expected = [self._wrap_check(['r11', 'r12']), self._wrap_check(['r21'])] + self._assertNestedRaggedArrayEqual(rec[2], expected) + np.testing.assert_array_equal(rec[3], self._wrap_check([[10.0, 11.0, 12.0], [20.0, 21.0, 22.0]])) + np.testing.assert_array_equal(rec[4], self._wrap_check([0, 1])) + expected = [self._wrap_check([0, 1]), self._wrap_check([2])] + for i, j in zip(rec[5], expected): + np.testing.assert_array_equal(i, j) + + def _check_one_row_no_df(self, rec): + self.assertEqual(rec[0], 0) + self.assertEqual(rec[1], 0.0) + np.testing.assert_array_equal(rec[2], self._wrap_check(['r11', 'r12'])) + np.testing.assert_array_equal(rec[3], self._wrap_check([10.0, 11.0, 12.0])) + self.assertEqual(rec[4], 0) + 
np.testing.assert_array_equal(rec[5], self._wrap_check([0, 1])) + np.testing.assert_array_equal(rec[6], self._wrap_check([(1.0, 11), (2.0, 12)])) + + def _check_one_row_multiselect_no_df(self, rec): + # difference from _check_one_row_no_df is that everything is wrapped in a list + self.assertEqual(rec[0], [0]) + self.assertEqual(rec[1], [0.0]) + np.testing.assert_array_equal(rec[2], [self._wrap_check(['r11', 'r12'])]) + np.testing.assert_array_equal(rec[3], [self._wrap_check([10.0, 11.0, 12.0])]) + self.assertEqual(rec[4], [0]) + np.testing.assert_array_equal(rec[5], [self._wrap_check([0, 1])]) + np.testing.assert_array_equal(rec[6], [self._wrap_check([(1.0, 11), (2.0, 12)])]) + + def _assertNestedRaggedArrayEqual(self, arr1, arr2): + """ + This is a helper function for _check_two_rows_no_df. + It compares arrays or lists containing numpy arrays that may be ragged + """ + self.assertEqual(type(arr1), type(arr2)) + self.assertEqual(len(arr1), len(arr2)) + if isinstance(arr1, np.ndarray): + if arr1.dtype == object: # both are arrays containing arrays, lists, or h5py.Dataset strings + for i, j in zip(arr1, arr2): + self._assertNestedRaggedArrayEqual(i, j) + elif np.issubdtype(arr1.dtype, np.number): + np.testing.assert_allclose(arr1, arr2) + else: + np.testing.assert_array_equal(arr1, arr2) + elif isinstance(arr1, list): + for i, j in zip(arr1, arr2): + self._assertNestedRaggedArrayEqual(i, j) + else: # scalar + self.assertEqual(arr1, arr2) + + def test_single_item(self): + rec = self.table1[0] + self._check_one_row_df(rec) + + def test_single_item_nested(self): + rec = self.table1.get(0, index=False) + self._check_one_row_df_nested(rec) def test_single_item_no_df(self): - elem = self.table.get(0, df=False) - self.assertEqual(elem[0], 0) - self.assertEqual(elem[1], 0.0) - np.testing.assert_array_equal(elem[2], np.array(['r11', 'r12'])) - np.testing.assert_array_equal(elem[3], np.array([10.0, 11.0, 12.0])) + rec = self.table1.get(0, df=False) + 
self._check_one_row_no_df(rec) def test_slice(self): - elem = self.table[0:2] - data = OrderedDict() - data['foo'] = [0.0, 1.0] - data['bar'] = [np.array(['r11', 'r12']), np.array(['r21'])] - data['baz'] = [np.array([10.0, 11.0, 12.0]), - np.array([20.0, 21.0, 22.0])] - idx = [0, 1] - exp = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) - pd.testing.assert_frame_equal(elem, exp) + rec = self.table1[0:2] + self._check_two_rows_df(rec) + + def test_slice_nested(self): + rec = self.table1.get(slice(0, 2), index=False) + self._check_two_rows_df_nested(rec) def test_slice_no_df(self): - elem = self.table.get(slice(0, 2), df=False) - self.assertEqual(elem[0], [0, 1]) - np.testing.assert_array_equal(elem[1], np.array([0.0, 1.0])) - np.testing.assert_array_equal(elem[2][0], np.array(['r11', 'r12'])) - np.testing.assert_array_equal(elem[2][1], np.array(['r21'])) - np.testing.assert_array_equal(elem[3], np.array([[10.0, 11.0, 12.0], [20.0, 21.0, 22.0]])) + rec = self.table1.get(slice(0, 2), df=False) + self._check_two_rows_no_df(rec) + + def test_slice_single(self): + rec = self.table1[0:1] + self._check_one_row_df(rec) + + def test_slice_single_nested(self): + rec = self.table1.get(slice(0, 1), index=False) + self._check_one_row_df_nested(rec) + + def test_slice_single_no_df(self): + rec = self.table1.get(slice(0, 1), df=False) + self._check_one_row_multiselect_no_df(rec) def test_list(self): - elem = self.table[[0, 1]] - data = OrderedDict() - data['foo'] = [0.0, 1.0] - data['bar'] = [np.array(['r11', 'r12']), np.array(['r21'])] - data['baz'] = [np.array([10.0, 11.0, 12.0]), - np.array([20.0, 21.0, 22.0])] - idx = [0, 1] - exp = pd.DataFrame(data=data, index=pd.Index(name='id', data=idx)) - pd.testing.assert_frame_equal(elem, exp) + rec = self.table1[[0, 1]] + self._check_two_rows_df(rec) + + def test_list_nested(self): + rec = self.table1.get([0, 1], index=False) + self._check_two_rows_df_nested(rec) def test_list_no_df(self): - elem = self.table.get([0, 1], 
df=False) - self.assertEqual(elem[0], [0, 1]) - np.testing.assert_array_equal(elem[1], np.array([0.0, 1.0])) - np.testing.assert_array_equal(elem[2][0], np.array(['r11', 'r12'])) - np.testing.assert_array_equal(elem[2][1], np.array(['r21'])) - np.testing.assert_array_equal(elem[3], np.array([[10.0, 11.0, 12.0], [20.0, 21.0, 22.0]])) + rec = self.table1.get([0, 1], df=False) + self._check_two_rows_no_df(rec) + + def test_list_single(self): + rec = self.table1[[0]] + self._check_one_row_df(rec) + + def test_list_single_nested(self): + rec = self.table1.get([0], index=False) + self._check_one_row_df_nested(rec) + + def test_list_single_no_df(self): + rec = self.table1.get([0], df=False) + self._check_one_row_multiselect_no_df(rec) + + def test_array(self): + rec = self.table1[np.array([0, 1])] + self._check_two_rows_df(rec) + + def test_array_nested(self): + rec = self.table1.get(np.array([0, 1]), index=False) + self._check_two_rows_df_nested(rec) + + def test_array_no_df(self): + rec = self.table1.get(np.array([0, 1]), df=False) + self._check_two_rows_no_df(rec) + + def test_array_single(self): + rec = self.table1[np.array([0])] + self._check_one_row_df(rec) + + def test_array_single_nested(self): + rec = self.table1.get(np.array([0]), index=False) + self._check_one_row_df_nested(rec) + + def test_array_single_no_df(self): + rec = self.table1.get(np.array([0]), df=False) + self._check_one_row_multiselect_no_df(rec) + + def test_to_dataframe_nested(self): + rec = self.table1.to_dataframe() + self._check_two_rows_df_nested(rec) + + def test_to_dataframe(self): + rec = self.table1.to_dataframe(index=True) + self._check_two_rows_df(rec) + + +class TestSelectionArray(SelectionTestMixin, TestCase): + + def _wrap(self, my_list): + return np.array(my_list) + + def _wrap_check(self, my_list): + return self._wrap(my_list) + + +class TestSelectionList(SelectionTestMixin, TestCase): + + def _wrap(self, my_list): + return my_list + + def _wrap_check(self, my_list): + return 
self._wrap(my_list) + + +class TestSelectionH5Dataset(SelectionTestMixin, TestCase): + + def setUp(self): + self.path = get_temp_filepath() + self.file = h5py.File(self.path, 'w') + self.dset_counter = 0 + super().setUp() + + def tearDown(self): + super().tearDown() + self.file.close() + if os.path.exists(self.path): + os.remove(self.path) + + def _wrap(self, my_list): + self.dset_counter = self.dset_counter + 1 + kwargs = dict() + if isinstance(my_list[0], str): + kwargs['dtype'] = H5_TEXT + elif isinstance(my_list[0], tuple): # compound dtype + # normally for cpd dtype, __resolve_dtype__ takes a list of DtypeSpec objects + cpd_type = [dict(name='cpd_float', dtype=np.dtype('float64')), + dict(name='cpd_int', dtype=np.dtype('int32'))] + kwargs['dtype'] = HDF5IO.__resolve_dtype__(cpd_type, my_list[0]) + dset = self.file.create_dataset('dset%d' % self.dset_counter, data=np.array(my_list, **kwargs)) + if H5PY_3 and isinstance(my_list[0], str): + return StrDataset(dset, None) # return a wrapper to read data as str instead of bytes + else: + # NOTE: h5py.Dataset with compound dtype are read as numpy arrays with compound dtype, not tuples + return dset + + def _wrap_check(self, my_list): + # getitem on h5dataset backed data will return np.array + kwargs = dict() + if isinstance(my_list[0], str): + kwargs['dtype'] = H5_TEXT + elif isinstance(my_list[0], tuple): + cpd_type = [dict(name='cpd_float', dtype=np.dtype('float64')), + dict(name='cpd_int', dtype=np.dtype('int32'))] + kwargs['dtype'] = np.dtype([(x['name'], x['dtype']) for x in cpd_type]) + # compound dtypes with str are read as bytes, see https://github.com/h5py/h5py/issues/1751 + return np.array(my_list, **kwargs) class TestVectorIndex(TestCase):