diff --git a/CHANGELOG.md b/CHANGELOG.md index e45aa6b4d..4d38b262f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,6 @@ - Add experimental namespace to HDMF common schema. New data types should go in the experimental namespace (hdmf-experimental) prior to being added to the core (hdmf-common) namespace. The purpose of this is to provide a place to test new data types that may break backward compatibility as they are refined. @ajtritt (#545) - - Add `EnumData` type for storing data that comes from a fixed set of values. This replaces `VocabData` i.e. `VocabData` has been removed. `VocabData` stored vocabulary elements in an attribute, which has a size limit. `EnumData` now stores elements in a separate dataset, referenced by an attribute stored on the `EnumData` dataset. @@ -20,14 +19,19 @@ Each sub-table is itself a DynamicTable that is aligned with the main table by row index. Each subtable defines a sub-category in the main table effectively creating a table with sub-headings to organize columns. @oruebel (#551) +- Equality check for `DynamicTable` now also checks that the name and description of the table are the same. @rly (#566) ### Internal improvements - Update CI and copyright year. @rly (#523, #524) +- Equality check for `DynamicTable` returns False if the other object is not a `DynamicTable` instead of raising an error. + @rly (#566) ### Bug fixes - Fix CI testing on Python 3.9. @rly (#523) - Fix certain edge cases where `GroupValidator` would not validate all of the child groups or datasets attached to a `GroupBuilder`. @dsleiter (#526) +- Fix generation of classes that extend both `MultiContainerInterface` and another class that extends + `MultiContainerInterface`. 
@rly (#567) ## HDMF 2.4.0 (February 23, 2021) diff --git a/docs/gallery/dynamictable.py b/docs/gallery/dynamictable.py index faf473c81..d51e56406 100644 --- a/docs/gallery/dynamictable.py +++ b/docs/gallery/dynamictable.py @@ -12,8 +12,8 @@ # The :py:class:`~hdmf.common.table.DynamicTable` class represents a column-based table # to which you can add custom columns. It consists of a name, a description, a list of # row IDs, and a list of columns. Columns are represented by -# :py:class:`~hdmf.common.table.VectorData` and :py:class:`~hdmf.common.table.VectorIndex` -# objects. +# :py:class:`~hdmf.common.table.VectorData`, :py:class:`~hdmf.common.table.VectorIndex`, +# and :py:class:`~hdmf.common.table.DynamicTableRegion` objects. ############################################################################### # Constructing a table @@ -32,7 +32,7 @@ ############################################################################### # Initializing columns # -------------------- -# You can initialize a :py:class:`~hdmf.common.table.DynamicTable` with particular +# You can create a :py:class:`~hdmf.common.table.DynamicTable` with particular # columns by passing a list or tuple of # :py:class:`~hdmf.common.table.VectorData` objects for the ``columns`` argument # in the constructor. @@ -73,10 +73,25 @@ id=[100, 200], ) +############################################################################### +# If a list of integers is passed to ``id``, +# :py:class:`~hdmf.common.table.DynamicTable` automatically creates +# an :py:class:`~hdmf.common.table.ElementIdentifiers` object, which is the data type +# that stores row IDs. 
The above command is equivalent to + +from hdmf.common.table import ElementIdentifiers + +table_set_ids = DynamicTable( + name='my table', + description='an example table', + columns=[col1, col2], + id=ElementIdentifiers(name='id', data=[100, 200]), +) + ############################################################################### # Adding rows # ----------- -# You can add rows to a :py:class:`~hdmf.common.table.DynamicTable` using +# You can also add rows to a :py:class:`~hdmf.common.table.DynamicTable` using # :py:meth:`DynamicTable.add_row `. # A keyword argument for every column in the table must be supplied. @@ -86,10 +101,9 @@ ) ############################################################################### -# You can also supply an optional row ID to +# You can supply an optional row ID to # :py:meth:`DynamicTable.add_row `. -# If no ID is supplied, the ID is automatically set to the number of rows in the table -# prior to adding the new row (i.e., automatic IDs start at 0). +# If no ID is supplied, the automatic row IDs count up from 0. table.add_row( col1=4, @@ -118,9 +132,35 @@ data=[True, True, False, True], # specify data for the 4 rows in the table ) +############################################################################### +# Enumerated Data +# --------------- +# :py:class:`~hdmf.common.table.EnumData` is a special type of column for storing +# an enumerated data type. This way each unique value is stored once, and the data +# references those values by index. Using this method is more efficient than storing +# a single value many times, and has the advantage of communicating to downstream +# tools that the data is categorical in nature. 
+ +from hdmf.common.table import EnumData + +# this column has a length of 5, not 3 +enum_col = EnumData( + name="cell_type", + description="this column holds categorical variables", + data=[0, 1, 2, 1, 0], + elements=["aa", "bb", "cc"] +) + +my_table = DynamicTable( + name='my table', + description='an example table', + columns=[enum_col], +) + + ############################################################################### # Ragged array columns -# ^^^^^^^^^^^^^^^^^^^^ +# -------------------- # A table column with a different number of elements for each row is called a # ragged array. To initialize a :py:class:`~hdmf.common.table.DynamicTable` # with a ragged array column, pass both @@ -150,9 +190,10 @@ #################################################################################### # VectorIndex.data provides the indices for how to break VectorData.data into cells # -# You can add a ragged array column to an existing +# You can add an empty ragged array column to an existing # :py:class:`~hdmf.common.table.DynamicTable` by specifying ``index=True`` # to :py:meth:`DynamicTable.add_column `. +# This method only works if run before any rows have been added to the table. new_table = DynamicTable( name='my table', @@ -179,6 +220,123 @@ ) ############################################################################### +# Referencing rows of other tables +# -------------------------------- +# You can create a column that references rows of another table by adding a +# :py:class:`~hdmf.common.table.DynamicTableRegion` object as a column of your +# :py:class:`~hdmf.common.table.DynamicTable`. This is analogous to +# a foreign key in a relational database. 
+ +from hdmf.common.table import DynamicTableRegion + +dtr_col = DynamicTableRegion( + name='table1_ref', + description='references rows of earlier table', + data=[0, 1, 0, 0], + table=table +) + +data_col = VectorData( + name='col2', + description='column #2', + data=['a', 'a', 'a', 'b'], +) + +table2 = DynamicTable( + name='my table', + description='an example table', + columns=[dtr_col, data_col], +) + +############################################################################### +# Here, the ``data`` of ``dtr_col`` maps to rows of ``table`` (0-indexed). +# +# .. note:: +# The ``data`` values of :py:class:`~hdmf.common.table.DynamicTableRegion` map to the row +# index, not the row ID, though if you are using default IDs, these values will be the +# same. +# +# Reference more than one row of another table with a +# :py:class:`~hdmf.common.table.DynamicTableRegion` indexed by a +# :py:class:`~hdmf.common.table.VectorIndex`. + +indexed_dtr_col = DynamicTableRegion( + name='table1_ref2', + description='references multiple rows of earlier table', + data=[0, 0, 1, 1, 0, 0, 1], + table=table +) + +dtr_idx = VectorIndex( + name='table1_ref2_index', + target=indexed_dtr_col, + data=[2, 3, 5, 7], +) + +table3 = DynamicTable( + name='my table', + description='an example table', + columns=[dtr_idx, indexed_dtr_col], +) + +############################################################################### +# Creating an expandable table +# ---------------------------- +# When using the default HDF5 backend, each column of these tables is an HDF5 Dataset, +# which by default is fixed in size. This means that once a file is written, it is not +# possible to add a new row. If you want to be able to save this file, load it, and add +# more rows to the table, you will need to set this up when you create the +# :py:class:`~hdmf.common.table.DynamicTable`. You do this by wrapping the data with +# :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO`. 
+ +from hdmf.backends.hdf5.h5_utils import H5DataIO + +col1 = VectorData( + name='expandable col1', + description='column #1', + data=H5DataIO(data=[1, 2], maxshape=(None,)), +) +col2 = VectorData( + name='expandable col2', + description='column #2', + data=H5DataIO(data=['a', 'b'], maxshape=(None,)), +) + +# Don't forget to wrap the row IDs too! +ids = ElementIdentifiers( + name='id', + data=H5DataIO( + data=[0, 1], + maxshape=(None,) + ) +) + +expandable_table = DynamicTable( + name='table that can be expanded after being saved to file', + description='an example table', + columns=[col1, col2], + id=ids, +) + +############################################################################### +# Now you can write the file, read it back, and run ``expandable_table.add_row()``. +# In this example, we are setting ``maxshape`` to ``(None,)``, which means this is a +# 1-dimensional matrix that can expand indefinitely along its single dimension. You +# could also use an integer in place of ``None``. For instance, ``maxshape=(8,)`` would +# allow the column to grow up to a length of 8. Whichever ``maxshape`` you choose, +# it should be the same for all :py:class:`~hdmf.common.table.VectorData`, +# :py:class:`~hdmf.common.table.ElementIdentifiers`, and +# :py:class:`~hdmf.common.table.DynamicTableRegion` objects in the +# :py:class:`~hdmf.common.table.DynamicTable`, since they must always be the same +# length. The default :py:class:`~hdmf.common.table.ElementIdentifiers` automatically +# generated when you pass a list of integers to the ``id`` argument of the +# :py:class:`~hdmf.common.table.DynamicTable` constructor is not expandable, so do not +# forget to create a :py:class:`~hdmf.common.table.ElementIdentifiers` object, and wrap +# that data as well. If any of the columns are indexed, the ``data`` arg of +# :py:class:`~hdmf.common.table.VectorIndex` will also need to be wrapped in +# :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO`. 
+# +# # Converting the table to a pandas ``DataFrame`` # ---------------------------------------------- # `pandas`_ is a popular data analysis tool, especially for working with tabular data. @@ -466,11 +624,6 @@ table_double_ragged_col['col6'] # returns col6_ind_ind table_double_ragged_col.col6 # returns col6 -############################################################################### -# Referencing rows of a DynamicTable -# ---------------------------------- -# TODO - ############################################################################### # Creating custom DynamicTable subclasses # --------------------------------------- diff --git a/src/hdmf/build/classgenerator.py b/src/hdmf/build/classgenerator.py index 4f635cd8d..cd4deb580 100644 --- a/src/hdmf/build/classgenerator.py +++ b/src/hdmf/build/classgenerator.py @@ -339,4 +339,9 @@ def post_process(cls, classdict, bases, docval_args, spec): :param spec: The spec for the container class to generate. """ if '__clsconf__' in classdict: - bases.insert(0, MultiContainerInterface) + # do not add MCI as a base if a base is already a subclass of MultiContainerInterface + for b in bases: + if issubclass(b, MultiContainerInterface): + break + else: + bases.insert(0, MultiContainerInterface) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index d6dcc2bb7..2822df0e3 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -583,18 +583,22 @@ def add_row(self, **kwargs): c.add_row(data[colname]) def __eq__(self, other): - """ - Compare if the two DynamicTables contain the same data + """Compare if the two DynamicTables contain the same data. - This implemented by converting the DynamicTables to a pandas dataframe and - comparing the equality of the two tables. + First this returns False if the other DynamicTable has a different name or + description. Then, this table and the other table are converted to pandas + dataframes and the equality of the two tables is returned. 
:param other: DynamicTable to compare to - :raises: An error will be raised with to_dataframe is not defined or other - :return: Bool indicating whether the two DynamicTables contain the same data """ + if other is self: + return True + if not isinstance(other, DynamicTable): + return False + if self.name != other.name or self.description != other.description: + return False return self.to_dataframe().equals(other.to_dataframe()) @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, # noqa: C901 @@ -1176,12 +1180,12 @@ class EnumData(VectorData): __fields__ = ('elements', ) - @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, + @docval({'name': 'name', 'type': str, 'doc': 'the name of this column'}, {'name': 'description', 'type': str, 'doc': 'a description for this column'}, {'name': 'data', 'type': ('array_data', 'data'), - 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()}, + 'doc': 'integers that index into elements for the value of each row', 'default': list()}, {'name': 'elements', 'type': ('array_data', 'data', VectorData), 'default': list(), - 'doc': 'the items in this elements'}) + 'doc': 'lookup values for each integer in ``data``'}) def __init__(self, **kwargs): elements = popargs('elements', kwargs) super().__init__(**kwargs) diff --git a/tests/unit/build_tests/test_classgenerator.py b/tests/unit/build_tests/test_classgenerator.py index f7aa529d4..d25f3b157 100644 --- a/tests/unit/build_tests/test_classgenerator.py +++ b/tests/unit/build_tests/test_classgenerator.py @@ -828,9 +828,6 @@ def test_post_process(self): groups=[ GroupSpec(data_type_inc='EmptyBar', doc='test multi', quantity='*') ], - attributes=[ - AttributeSpec(name='attr3', doc='a float attribute', dtype='float') - ] ) classdict = dict( __clsconf__=[ @@ -847,3 +844,31 @@ def test_post_process(self): docval_args = [] MCIClassGenerator.post_process(classdict, bases, docval_args, multi_spec) 
self.assertEqual(bases, [MultiContainerInterface, Container]) + + def test_post_process_already_multi(self): + class Multi1(MultiContainerInterface): + pass + + multi_spec = GroupSpec( + doc='A test extension that contains a multi and extends a multi', + data_type_def='Multi2', + data_type_inc='Multi1', + groups=[ + GroupSpec(data_type_inc='EmptyBar', doc='test multi', quantity='*') + ], + ) + classdict = dict( + __clsconf__=[ + dict( + attr='empty_bars', + type=EmptyBar, + add='add_empty_bars', + get='get_empty_bars', + create='create_empty_bars' + ) + ] + ) + bases = [Multi1] + docval_args = [] + MCIClassGenerator.post_process(classdict, bases, docval_args, multi_spec) + self.assertEqual(bases, [Multi1]) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index e498fa0d6..db4587d81 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -551,6 +551,63 @@ def test_multidim_col_one_elt_tuple(self): pd.testing.assert_frame_equal(df, df2) pd.testing.assert_frame_equal(table.get(0), df2) + def test_eq(self): + columns = [ + VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data) + ] + test_table = DynamicTable("with_columns_and_data", 'a test table', columns=columns) + + table = self.with_columns_and_data() + self.assertTrue(table == test_table) + + def test_eq_from_df(self): + df = pd.DataFrame({ + 'foo': [1, 2, 3, 4, 5], + 'bar': [10.0, 20.0, 30.0, 40.0, 50.0], + 'baz': ['cat', 'dog', 'bird', 'fish', 'lizard'] + }).loc[:, ('foo', 'bar', 'baz')] + + test_table = DynamicTable.from_dataframe(df, 'with_columns_and_data', table_description='a test table') + table = self.with_columns_and_data() + self.assertTrue(table == test_table) + + def test_eq_diff_missing_col(self): + columns = [ + VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data) + ] + del columns[-1] + test_table = DynamicTable("with_columns_and_data", 'a 
test table', columns=columns) + + table = self.with_columns_and_data() + self.assertFalse(table == test_table) + + def test_eq_diff_name(self): + columns = [ + VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data) + ] + test_table = DynamicTable("wrong name", 'a test table', columns=columns) + + table = self.with_columns_and_data() + self.assertFalse(table == test_table) + + def test_eq_diff_desc(self): + columns = [ + VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data) + ] + test_table = DynamicTable("with_columns_and_data", 'wrong description', columns=columns) + + table = self.with_columns_and_data() + self.assertFalse(table == test_table) + + def test_eq_bad_type(self): + container = Container('test_container') + table = self.with_columns_and_data() + self.assertFalse(table == container) + class TestDynamicTableRoundTrip(H5RoundTripMixin, TestCase):