Skip to content

Commit

Permalink
starting indexing of sets metadata fields, first pass at a reading fu…
Browse files Browse the repository at this point in the history
…nction
  • Loading branch information
LoannPeurey committed Dec 2, 2024
1 parent 05da630 commit afd0fed
Showing 1 changed file with 69 additions and 0 deletions.
69 changes: 69 additions & 0 deletions ChildProject/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,42 @@ class AnnotationManager:
IndexColumn(name="utterances", description="LENA utterances details (json)"),
IndexColumn(name="cries", description="cries (json)"),
IndexColumn(name="vfxs", description="Vfx (json)"),

]

# Metadata fields that describe an annotation set as a whole (as opposed to
# the per-annotation index columns declared above).
#
# Descriptions are built with implicit string concatenation rather than
# backslash continuations: a backslash inside a string literal keeps the
# leading indentation of the following source line in the runtime string.
SETS_COLUMNS = [
    IndexColumn(
        name="segmentation",
        description=(
            "source of the segmentation. `self` if produces its own, name of "
            "another set if using an other's set segmentation"
        ),
        dtype="str",
    ),
    IndexColumn(
        name="method",
        description="Method used for the annotations, automated, human or a mix of both",
        choices=['automated', 'human', 'mixed'],
    ),
    IndexColumn(
        name="annotator_name",
        description="unique name for human annotators",
    ),
    IndexColumn(
        name="annotator_experience",
        description=(
            "Estimation of annotator's experience from 1 to 5. "
            "1 being 'new to annotation' and 5 'Expert'."
        ),
        dtype="int",
        choices=[1, 2, 3, 4, 5],
    ),
    IndexColumn(
        name="annotation_algorithm_name",
        description="name of the algorithm",
        dtype="str",
        choices=['VTC', 'ALICE', 'VCM', 'ITS'],
    ),
    IndexColumn(
        name="annotation_algorithm_publication",
        description="scientific publication citation for the algorithm used",
        dtype="str",
    ),
    IndexColumn(
        name="annotation_algorithm_version",
        # A stray diaeresis character used to precede this text.
        description="version of the algorithm",
    ),
    IndexColumn(
        name="annotation_algorithm_repo",
        description=(
            "link to repository where the algorithm is stored. "
            "Ideally along with a commit hash for more reproducibility."
        ),
        dtype="str",
    ),
    IndexColumn(
        name="date_annotation",
        description=(
            "date when the annotation was produced, best practice is to "
            "give the day the annotation was finished. This is meant to be a "
            "broad time label and does not need to be very precise"
        ),
        datetime="%Y-%m-%d",
    ),

    # Content flags: each one states whether the set carries a given kind of
    # information. An empty value is accepted and documented as meaning "No".
    IndexColumn(
        name="has_speaker_type",
        description=(
            "Does the set contain the type of speakers. "
            "Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        # Spelling corrected from `has_trancription`.
        name="has_transcription",
        description="Does the set contain transcriptions. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_interactions",
        description=(
            "Does the set contain information about interactions between "
            "speakers. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_acoustics",
        description=(
            "Does the set contain information about acoustic features of "
            "speakers. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_addressee",
        description=(
            "Does the set contain the information of who the vocalization is "
            "addressed to. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_vcm",
        description=(
            "Does the set contain information about vocal maturity of "
            "vocalizations. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_words",
        description=(
            "Does the set contain information about number of words "
            "contained. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
]

def __init__(self, project: ChildProject):
Expand Down Expand Up @@ -367,6 +403,39 @@ def read(self) -> Tuple[List[str], List[str]]:
return errors, warnings


def read_sets_metadata(self) -> Tuple[List[str], List[str]]:
    """
    Read the metadata file of every set found in ``self.annotations``.

    For each distinct value of the ``set`` column, the file
    ``<project path>/ANNOTATIONS/<set>/METANNOTS`` is parsed as YAML. The
    resulting mappings are gathered into a dataframe stored in
    ``self.sets`` (one row per set whose metadata could be read, with the
    set name in the ``set`` column).

    Nothing is read when ``self.annotations`` is missing or has no ``set``
    column (so call ``read()`` first); ``self.sets`` is then an empty
    dataframe.

    :return: a tuple ``(errors, warnings)`` of message lists. Missing
        metadata files produce warnings; unparsable or non-mapping metadata
        files produce errors.
    """
    errors, warnings = [], []
    sets_metadata = []

    annotations = getattr(self, "annotations", None)
    if annotations is None or "set" not in annotations:
        set_names = []
    else:
        set_names = annotations["set"].unique()

    for set_name in set_names:
        expected_path = self.project.path / ANNOTATIONS / set_name / METANNOTS

        if expected_path.exists():
            with open(expected_path, "r") as meta_stream:
                try:
                    # safe_load: metadata is plain data, and yaml.load()
                    # without an explicit Loader fails on PyYAML >= 6.
                    metadata = yaml.safe_load(meta_stream)
                except yaml.YAMLError as exc:
                    errors.append(
                        f"Metadata file for set {set_name} at {expected_path} "
                        f"could not be parsed: {exc}"
                    )
                    continue

            # An empty YAML document parses to None.
            if metadata is None:
                metadata = {}
            if not isinstance(metadata, dict):
                errors.append(
                    f"Metadata file for set {set_name} at {expected_path} "
                    "should contain a mapping of fields to values"
                )
                continue

            # Record which set the row describes; the directory name takes
            # precedence over any `set` key written in the file.
            sets_metadata.append({**metadata, "set": set_name})
        elif os.path.lexists(expected_path):
            # The path entry exists but its target does not (e.g. a dangling
            # symlink whose content has not been fetched).
            # TODO: group these warnings for all sets lacking their content.
            warnings.append(
                f"Metadata file content for set {set_name} at {expected_path} "
                "could not be found, it may be downloaded from a remote with "
                f"the command `datalad get {expected_path}`"
            )
        else:
            # TODO: emit a single grouped warning for all sets without a
            # metadata file.
            warnings.append(
                f"Metadata file for set {set_name} at {expected_path} "
                "could not be found, it should be created"
            )

    self.sets = pd.DataFrame(sets_metadata)

    return errors, warnings

def validate_annotation(self, annotation: dict) -> Tuple[List[str], List[str]]:
logger_annotations.info("Validating %s from %s...", annotation["annotation_filename"], annotation["set"])
segments = IndexTable(
Expand Down

0 comments on commit afd0fed

Please sign in to comment.