Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a TODO to start implementation of HED support in annotations #13059

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions mne/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ __all__ = [
"Evoked",
"EvokedArray",
"Forward",
"HEDAnnotations",
"Info",
"Label",
"MixedSourceEstimate",
Expand Down Expand Up @@ -260,6 +261,7 @@ from ._freesurfer import (
)
from .annotations import (
Annotations,
HEDAnnotations,
annotations_from_events,
count_annotations,
events_from_annotations,
Expand Down
163 changes: 162 additions & 1 deletion mne/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
verbose,
warn,
)
from .utils.check import _soft_import

# For testing windows_like_datetime, we monkeypatch "datetime" in this module.
# Keep the true datetime object around for _validate_type use.
Expand Down Expand Up @@ -151,6 +152,7 @@ class Annotations:
--------
mne.annotations_from_events
mne.events_from_annotations
mne.HEDAnnotations

Notes
-----
Expand Down Expand Up @@ -288,7 +290,7 @@ def orig_time(self):

def __eq__(self, other):
"""Compare to another Annotations instance."""
if not isinstance(other, Annotations):
if not isinstance(other, type(self)):
return False
return (
np.array_equal(self.onset, other.onset)
Expand Down Expand Up @@ -567,6 +569,8 @@ def _sort(self):
self.duration = self.duration[order]
self.description = self.description[order]
self.ch_names = self.ch_names[order]
if hasattr(self, "hed_strings"):
self.hed_strings = self.hed_strings[order]

@verbose
def crop(
Expand Down Expand Up @@ -758,6 +762,163 @@ def rename(self, mapping, verbose=None):
return self


class HEDAnnotations(Annotations):
    """Annotations object for annotating segments of raw data with HED tags.

    Parameters
    ----------
    onset : array of float, shape (n_annotations,)
        The starting time of annotations in seconds after ``orig_time``.
    duration : array of float, shape (n_annotations,) | float
        Durations of the annotations in seconds. If a float, all the
        annotations are given the same duration.
    description : array of str, shape (n_annotations,) | str
        Array of strings containing description for each annotation. If a
        string, all the annotations are given the same description. To reject
        epochs, use description starting with keyword 'bad'.
    hed_strings : array of str, shape (n_annotations,) | str
        Sequence of strings containing a HED tag (or comma-separated list of
        HED tags) for each annotation. If a single string is provided, all
        annotations are assigned the same HED string.
    hed_version : str
        The HED schema version against which to validate the HED strings.
    orig_time : float | str | datetime | tuple of int | None
        A POSIX Timestamp, datetime or a tuple containing the timestamp as the
        first element and microseconds as the second element. Determines the
        starting time of annotation acquisition. If None (default),
        starting time is determined from beginning of raw data acquisition.
        In general, ``raw.info['meas_date']`` (or None) can be used for syncing
        the annotations with raw data if their acquisition is started at the
        same time. If it is a string, it should conform to the ISO8601 format.
        More precisely to this '%%Y-%%m-%%d %%H:%%M:%%S.%%f' particular case of
        the ISO8601 format where the delimiter between date and time is ' '.
    %(ch_names_annot)s

    See Also
    --------
    mne.Annotations

    Notes
    -----
    The optional ``hed`` package is soft-imported when an instance is created
    and is used to validate every entry of ``hed_strings`` against the schema
    version given by ``hed_version``; a ``ValueError`` is raised if any entry
    fails validation or if the number of HED strings does not match the number
    of annotations.

    .. versionadded:: 1.10
    """

def __init__(
    self,
    onset,
    duration,
    description,
    hed_strings,
    hed_version="8.3.0",  # TODO @VisLab what is a sensible default here?
    orig_time=None,
    ch_names=None,
):
    """Create the annotations, then validate and attach the HED strings.

    Raises
    ------
    ValueError
        If the number of HED strings does not match the number of onsets, or
        (via ``_update_hed_strings``) if any HED string fails validation.
    """
    # Soft import: ``hed`` is only needed for HED validation.
    self.hed = _soft_import("hed", "validation of HED tags in annotations")
    self.hed_version = hed_version

    # Normalize to a 1-D string array so a single HED string applies to every
    # annotation (as documented) and so NumPy fancy indexing works on it.
    n_annot = len(np.atleast_1d(np.asarray(onset)))
    if isinstance(hed_strings, str):
        hed_strings = [hed_strings] * n_annot
    hed_strings = np.atleast_1d(np.array(hed_strings, dtype=str))
    if len(hed_strings) != n_annot:
        raise ValueError(
            f"Number of HED strings ({len(hed_strings)}) must match the number of "
            f"annotations ({n_annot})."
        )

    # Attach the (not yet validated) strings *before* the base-class init:
    # ``Annotations._sort`` reorders ``hed_strings`` only when the attribute
    # already exists, so setting it afterwards would leave the HED strings in
    # input order while onsets/durations/descriptions were sorted.
    # NOTE(review): assumes the base ``__init__`` sorts via ``_sort`` — the
    # ``hasattr(self, "hed_strings")`` guard there suggests so; confirm.
    self.hed_strings = hed_strings

    super().__init__(
        onset=onset,
        duration=duration,
        description=description,
        orig_time=orig_time,
        ch_names=ch_names,
    )
    # Validate the strings in their final (possibly re-sorted) order.
    self._update_hed_strings(hed_strings=self.hed_strings)

def _update_hed_strings(self, hed_strings):
    """Validate ``hed_strings`` against the schema and store them.

    Parameters
    ----------
    hed_strings : array-like of str | str
        One HED string per annotation. A single string is repeated for every
        annotation.

    Raises
    ------
    ValueError
        If the number of HED strings differs from the number of annotations,
        or if any HED string fails validation.
    """
    # NB: must import; calling self.hed.validator.HedValidator doesn't work
    from hed.validator import HedValidator

    # A bare string would otherwise be length-checked character by character.
    if isinstance(hed_strings, str):
        hed_strings = [hed_strings] * len(self)
    # Store as an array so integer-array / list indexing (used when sorting
    # and slicing annotations) works.
    hed_strings = np.atleast_1d(np.array(hed_strings, dtype=str))
    if len(hed_strings) != len(self):
        raise ValueError(
            f"Number of HED strings ({len(hed_strings)}) must match the number of "
            f"annotations ({len(self)})."
        )
    # validation of HED strings
    schema = self.hed.load_schema_version(self.hed_version)
    validator = HedValidator(schema)
    error_handler = self.hed.errors.ErrorHandler(check_for_warnings=False)
    error_strs = [
        self._validate_one_hed_string(str(hs), schema, validator, error_handler)
        for hs in hed_strings
    ]
    # Keep only real failures so the message has no blank bullet points.
    error_strs = [err for err in error_strs if err]
    if error_strs:
        raise ValueError(
            "Some HED strings in your annotations failed to validate:\n - "
            + "\n - ".join(error_strs)
        )
    self.hed_strings = hed_strings

def _validate_one_hed_string(self, hed_string, schema, validator, error_handler):
    """Run the validator on one HED string and return its printable issues.

    The caller treats a non-empty return value as a validation failure.
    """
    parsed = self.hed.HedString(hed_string, schema)
    return self.hed.get_printable_issue_string(
        validator.validate(
            parsed, allow_placeholders=False, error_handler=error_handler
        )
    )

def __eq__(self, other):
    """Compare to another HEDAnnotations instance.

    Two instances are equal when the base-class comparison succeeds and both
    the HED strings and the HED schema version match.
    """
    # Guard first so ``other.hed_strings`` is never read on a foreign type.
    if not isinstance(other, HEDAnnotations):
        return False
    # NOTE(review): summary of the review thread attached to this method —
    # HED strings are unordered, so "(A, (B, C))" and "((C, B), A)" mean the
    # same thing; comparing the raw ``str`` values is therefore too strict.
    # The HED maintainers recommend comparing ``hed.HedString`` objects after
    # sorting. The proposal under discussion is to store the sorted
    # ("canonical") string form in a private attribute at validation time and
    # compare those here. Whether ``hed_version`` should take part in the
    # comparison at all is still an open question.
    return (
        # zero-argument super() is already bound to self; pass only ``other``
        super().__eq__(other)
        and np.array_equal(self.hed_strings, other.hed_strings)
        and self.hed_version == other.hed_version
    )

def __repr__(self):
    """Summarize segment count, channel-specificity, and HED string tallies."""
    tallies = sorted(Counter(self.hed_strings).items())
    kinds = ", ".join("{} ({})".format(*item) for item in tallies)
    if len(kinds) > 0:
        # only add the separator when there is something to list
        kinds = ": " + kinds
    ch_specific = ", channel-specific" if self._any_ch_names() else ""
    s = (
        f"HEDAnnotations | {len(self.onset)} segment"
        f"{_pl(len(self.onset))}{ch_specific}{kinds}"
    )
    # keep the whole repr on a single 79-character line
    return "<" + shorten(s, width=77, placeholder=" ...") + ">"

def __getitem__(self, key, *, with_ch_names=None):
    """Propagate indexing and slicing to the underlying numpy structure.

    Returns
    -------
    result : OrderedDict | HEDAnnotations
        The base-class result for ``key`` with the matching HED string(s)
        added: a dict gains a ``"hed_strings"`` entry, anything else is
        rebuilt as a new ``HEDAnnotations`` instance.
    """
    # zero-argument super() is already bound to self; do not pass it again
    result = super().__getitem__(key, with_ch_names=with_ch_names)
    # index through an array so list / integer-array keys work even if the
    # HED strings happen to be stored as a plain list
    hed_strings = np.asarray(self.hed_strings)
    if isinstance(result, OrderedDict):
        result["hed_strings"] = hed_strings[key]
        return result
    key = list(key) if isinstance(key, tuple) else key
    return HEDAnnotations(
        result.onset,
        result.duration,
        result.description,
        hed_strings[key],
        hed_version=self.hed_version,
        orig_time=self.orig_time,
        ch_names=result.ch_names,
    )

def append(self, onset, duration, description, ch_names=None):
    """TODO: not implemented yet for HED annotations.

    Raises
    ------
    NotImplementedError
        Always. Raising is deliberate: a silent no-op here would hide the
        inherited ``Annotations.append`` and drop the new annotations.
    """
    raise NotImplementedError(
        "HEDAnnotations.append is not implemented yet (no HED strings can be "
        "supplied for the appended annotations)."
    )

def count(self):
    """TODO. Unlike Annotations.count, keys should be HED tags not descriptions.

    Raises
    ------
    NotImplementedError
        Always. Raising is deliberate: a silent no-op here would return
        ``None`` to callers that expect a mapping of counts.
    """
    raise NotImplementedError("HEDAnnotations.count is not implemented yet.")

def crop(
    self, tmin=None, tmax=None, emit_warning=False, use_orig_time=True, verbose=None
):
    """TODO: not implemented yet for HED annotations.

    Raises
    ------
    NotImplementedError
        Always. Raising is deliberate: a silent no-op here would leave the
        annotations uncropped and return ``None``.
    """
    raise NotImplementedError("HEDAnnotations.crop is not implemented yet.")

def delete(self, idx):
    """TODO: not implemented yet for HED annotations.

    Raises
    ------
    NotImplementedError
        Always. Raising is deliberate: a silent no-op here would leave the
        requested annotations in place.
    """
    raise NotImplementedError("HEDAnnotations.delete is not implemented yet.")

def to_data_frame(self, time_format="datetime"):
    """TODO: not implemented yet for HED annotations.

    Raises
    ------
    NotImplementedError
        Always. Raising is deliberate: a silent no-op here would return
        ``None`` instead of a data frame.
    """
    raise NotImplementedError(
        "HEDAnnotations.to_data_frame is not implemented yet."
    )
Comment on lines +899 to +919
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VisLab these TODOs are for me. So as you can see some things aren't going to work yet, but we're already at least able to do:

$ ipython
In [1]: import mne
In [2]: foo = mne.HEDAnnotations([0, 1], [0.5, 1.2], ['foo', 'bar'], ['hed/foo', 'hed/
   ...: bar'])
In [3]: foo
Out[3]: <HEDAnnotations | 2 segments: hed/bar (1), hed/foo (1)>

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not completely sure what these do, but would be willing to help as needed. Would the get_annotations_per_epoch then have an additional list for HED annotations in the list of lists?

Thanks @drammock



class EpochAnnotationsMixin:
"""Mixin class for Annotations in Epochs."""

Expand Down
Loading