@@ -279,6 +279,42 @@ class AnnotationManager:
279
279
IndexColumn (name = "utterances" , description = "LENA utterances details (json)" ),
280
280
IndexColumn (name = "cries" , description = "cries (json)" ),
281
281
IndexColumn (name = "vfxs" , description = "Vfx (json)" ),
282
+
283
+ ]
284
+
285
# Metadata fields describing an annotation set as a whole (as opposed to
# the per-annotation index columns). Long descriptions are built with
# adjacent string literals rather than a backslash continuation inside
# the literal, which would embed the source indentation in the text.
SETS_COLUMNS = [
    IndexColumn(
        name="segmentation",
        description="source of the segmentation. `self` if produces its own, name of "
                    "another set if using an other's set segmentation",
        dtype="str",
    ),
    IndexColumn(
        name="method",
        description="Method used for the annotations, automated, human or a mix of both",
        choices=['automated', 'human', 'mixed'],
    ),
    IndexColumn(
        name="annotator_name",
        description="unique name for human annotators",
    ),
    # NOTE(review): choices are ints here while every other column uses
    # string choices; confirm the validator compares against the converted
    # value and not the raw string read from the file.
    IndexColumn(
        name="annotator_experience",
        description="Estimation of annotator's experience from 1 to 5. "
                    "1 being 'new to annotation' and 5 'Expert'.",
        dtype="int",
        choices=[1, 2, 3, 4, 5],
    ),
    IndexColumn(
        name="annotation_algorithm_name",
        description="name of the algorithm",
        dtype="str",
        choices=['VTC', 'ALICE', 'VCM', 'ITS'],
    ),
    IndexColumn(
        name="annotation_algorithm_publication",
        description="scientific publication citation for the "
                    "algorithm used",
        dtype="str",
    ),
    IndexColumn(
        name="annotation_algorithm_version",
        description="version of the algorithm",
    ),
    IndexColumn(
        name="annotation_algorithm_repo",
        description="link to repository where the algorithm is stored. "
                    "Ideally along with a commit hash for more reproducibility.",
        dtype="str",
    ),
    IndexColumn(
        name="date_annotation",
        description="date when the annotation was produced, best practice is to "
                    "give the day the annotation was finished. This is meant to be a "
                    "broad time label and does not need to be very "
                    "precise",
        datetime="%Y-%m-%d",
    ),

    # Content flags: 'Y' when the set carries the information, 'N' or an
    # empty value otherwise.
    IndexColumn(
        name="has_speaker_type",
        description="Does the set contain the type of speakers. Yes(Y) / "
                    "No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_transcription",
        description="Does the set contain transcriptions. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_interactions",
        description="Does the set contain information about interactions between "
                    "speakers. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_acoustics",
        description="Does the set contain information about acoustic features of "
                    "speakers. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_addressee",
        description="Does the set contain the information of who the vocalization is "
                    "addressed to. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_vcm",
        description="Does the set contain information about vocal maturity of "
                    "vocalizations. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
    IndexColumn(
        name="has_words",
        description="Does the set contain information about number of words "
                    "contained. Yes(Y) / No(N or empty)",
        choices=['Y', 'N', ''],
    ),
]
283
319
284
320
def __init__ (self , project : ChildProject ):
@@ -367,6 +403,39 @@ def read(self) -> Tuple[List[str], List[str]]:
367
403
return errors , warnings
368
404
369
405
406
def read_sets_metadata(self) -> Tuple[List[str], List[str]]:
    """
    Read the metadata file of every set found in ``self.annotations``.

    For each distinct value of the ``set`` column, the file
    ``<project path>/ANNOTATIONS/<set>/METANNOTS`` is parsed as YAML and
    contributes one row to ``self.sets``. ``self.annotations`` has to be
    populated beforehand (call ``read()`` first).

    A set whose metadata file is missing, or is a dangling link whose
    content has not been fetched, produces a warning and no row.
    A file whose content is not a YAML mapping produces an error and no
    row; an empty file produces an empty row.

    :return: a tuple ``(errors, warnings)`` of message lists
    """
    errors, warnings = [], []

    set_names = self.annotations['set'].unique()

    # NOTE(review): rows do not carry the name of the set they describe and
    # sets without a readable file are skipped, so rows cannot be mapped back
    # to sets by position; consider keying the records by set name.
    sets_metadata = []
    for set_name in set_names:
        expected_path = self.project.path / ANNOTATIONS / set_name / METANNOTS

        if expected_path.exists():
            with open(expected_path, 'r', encoding='utf-8') as meta_stream:
                # safe_load only builds plain Python objects; yaml.load
                # without an explicit Loader is unsafe and is rejected by
                # recent PyYAML versions.
                content = yaml.safe_load(meta_stream)

            if content is None:
                # an empty file parses to None; treat it as "no fields set"
                content = {}

            if isinstance(content, dict):
                sets_metadata.append(content)
            else:
                errors.append(
                    f"Metadata file for set {set_name} at {expected_path} "
                    f"should contain a mapping of fields to values"
                )
        elif os.path.lexists(expected_path):
            # The path exists as a link but its target does not.
            # TODO: group these warnings for all sets lacking their file
            warnings.append(
                f"Metadata file content for set {set_name} at {expected_path} "
                f"could not be found, it may be downloaded from a remote with "
                f"the command `datalad get {expected_path}`"
            )
        else:
            # TODO: emit a single grouped warning for all sets without a file
            warnings.append(
                f"Metadata file for set {set_name} at {expected_path} "
                f"could not be found, it should be created"
            )

    self.sets = pd.DataFrame(sets_metadata)

    return errors, warnings
438
+
370
439
def validate_annotation (self , annotation : dict ) -> Tuple [List [str ], List [str ]]:
371
440
logger_annotations .info ("Validating %s from %s..." , annotation ["annotation_filename" ], annotation ["set" ])
372
441
segments = IndexTable (
0 commit comments