Skip to content

Commit afd0fed

Browse files
committed
starting indexing of sets metadata fields, first pass at a reading function
1 parent 05da630 commit afd0fed

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

ChildProject/annotations.py

+69
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,42 @@ class AnnotationManager:
279279
IndexColumn(name="utterances", description="LENA utterances details (json)"),
280280
IndexColumn(name="cries", description="cries (json)"),
281281
IndexColumn(name="vfxs", description="Vfx (json)"),
282+
283+
]
284+
285+
# Metadata fields that can be attached to a whole set of annotations.
# Descriptions are built with implicit string concatenation rather than
# backslash continuations inside the literal, so that source indentation
# does not leak into the text.
SETS_COLUMNS = [
    # --- provenance of the segmentation and of the annotations ---
    IndexColumn(
        name="segmentation",
        description=(
            "source of the segmentation. `self` if produces its own, name of "
            "another set if using an other's set segmentation"
        ),
        dtype="str",
    ),
    IndexColumn(
        name="method",
        description="Method used for the annotations, automated, human or a mix of both",
        choices=['automated', 'human', 'mixed'],
    ),
    # --- human annotator ---
    IndexColumn(name="annotator_name", description="unique name for human annotators"),
    IndexColumn(
        name="annotator_experience",
        description=(
            "Estimation of annotator's experience from 1 to 5. "
            "1 being 'new to annotation' and 5 'Expert'."
        ),
        dtype="int",
        choices=[1, 2, 3, 4, 5],
    ),
    # --- automated algorithm ---
    IndexColumn(
        name="annotation_algorithm_name",
        description="name of the algorithm",
        dtype="str",
        choices=['VTC', 'ALICE', 'VCM', 'ITS'],
    ),
    IndexColumn(
        name="annotation_algorithm_publication",
        description="scientific publication citation for the algorithm used",
        dtype="str",
    ),
    IndexColumn(name="annotation_algorithm_version", description="version of the algorithm"),
    IndexColumn(
        name="annotation_algorithm_repo",
        description=(
            "link to repository where the algorithm is stored. "
            "Ideally along with a commit hash for more reproducibility."
        ),
        dtype="str",
    ),
    IndexColumn(
        name="date_annotation",
        description=(
            "date when the annotation was produced, best practice is to "
            "give the day the annotation was finished. This is meant to be a broad time label "
            "and does not need to be very precise"
        ),
        datetime="%Y-%m-%d",
    ),
    # --- content flags: 'Y' for yes, 'N' or empty for no ---
    IndexColumn(
        name="has_speaker_type",
        description="Does the set contain the type of speakers. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    # NOTE: spelled `has_transcription`; the earlier spelling `has_trancription` was a typo.
    IndexColumn(
        name="has_transcription",
        description="Does the set contain transcriptions. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_interactions",
        description=(
            "Does the set contain information about interactions between "
            "speakers. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_acoustics",
        description=(
            "Does the set contain information about acoustic features of "
            "speakers. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_addressee",
        description=(
            "Does the set contain the information of who the vocalization is "
            "addressed to. Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_vcm",
        description=(
            "Does the set contain information about vocal maturity of vocalizations. "
            "Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_words",
        description=(
            "Does the set contain information about number of words contained. "
            "Yes(Y) / No(N or empty)"
        ),
        choices=['Y', 'N', ''],
    ),
]
283319

284320
def __init__(self, project: ChildProject):
@@ -367,6 +403,39 @@ def read(self) -> Tuple[List[str], List[str]]:
367403
return errors, warnings
368404

369405

406+
def read_sets_metadata(self):
    """
    Read the metadata file of every set found in ``self.annotations``.

    For each distinct value of the ``set`` column, the file
    ``<project path>/ANNOTATIONS/<set>/METANNOTS`` is parsed as YAML and
    stored as one row of ``self.sets`` (a pandas DataFrame). Each row also
    carries the name of the set it belongs to in a ``set`` column, since
    sets without a readable metadata file produce no row.

    Nothing is read when ``self.annotations`` is missing, empty, or has no
    ``set`` column (so do ``read()`` first); ``self.sets`` is then an empty
    DataFrame.

    :return: a tuple ``(errors, warnings)``, each a list of messages
    :rtype: Tuple[List[str], List[str]]
    """
    errors, warnings = [], []

    annotations = getattr(self, "annotations", None)
    if annotations is None or "set" not in annotations:
        set_names = []
    else:
        set_names = annotations["set"].unique()

    sets_metadata = []
    for set_name in set_names:
        expected_path = self.project.path / ANNOTATIONS / set_name / METANNOTS

        if expected_path.exists():
            with open(expected_path, "r") as meta_stream:
                # safe_load: metadata files are plain data, and a bare
                # yaml.load() without a Loader is unsafe / rejected by
                # recent PyYAML versions.
                metadata = yaml.safe_load(meta_stream)

            if metadata is None:
                # an empty file carries no field at all
                metadata = {}
            if not isinstance(metadata, dict):
                errors.append(
                    f"Metadata file for set {set_name} at {expected_path} "
                    "should contain a mapping of field names to values"
                )
                continue

            row = dict(metadata)
            row["set"] = set_name
            sets_metadata.append(row)
        elif os.path.lexists(expected_path):
            # exists() is False but lexists() is True: the path is a broken
            # symbolic link, i.e. the file is known but its content is absent.
            # TODO: warnings should probably be grouped for all sets that
            # don't have their metadata file
            warnings.append(
                f"Metadata file content for set {set_name} at {expected_path} "
                "could not be found, it may be downloaded from a remote with "
                f"the command `datalad get {expected_path}`"
            )
        else:
            # TODO: this should probably produce a single warning for all sets
            warnings.append(
                f"Metadata file for set {set_name} at {expected_path} "
                "could not be found, it should be created"
            )

    self.sets = pd.DataFrame(sets_metadata)

    return errors, warnings
438+
370439
def validate_annotation(self, annotation: dict) -> Tuple[List[str], List[str]]:
371440
logger_annotations.info("Validating %s from %s...", annotation["annotation_filename"], annotation["set"])
372441
segments = IndexTable(

0 commit comments

Comments
 (0)