Merge pull request #486 from LAAC-LSCP/samplers/session_id

LoannPeurey · web-flow · commit 5c5c51c537f2 · 2024-11-21T09:01:47.000-06:00
Samplers/session
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
 - conversations summary extraction pipeline
 - docs and tests for init command
 - docs and tests for automated-import command
+- option '--by' for periodic sampling along with doc and tests
 
 ### Fixed
 
diff --git a/ChildProject/pipelines/samplers.py b/ChildProject/pipelines/samplers.py
@@ -224,6 +224,7 @@ def __init__(
         period: int,
         offset: int = 0,
         profile: str = None,
+        by: str = "recording_filename",
         recordings: Union[str, List[str], pd.DataFrame] = None,
         exclude: Union[str, pd.DataFrame] = None,
     ):
@@ -233,6 +234,7 @@ def __init__(
         self.period = int(period)
         self.offset = int(offset)
         self.profile = profile
+        self.by = by
 
     def _sample(self):
         recordings = self.project.get_recordings_from_list(self.recordings)
@@ -250,23 +252,34 @@ def _sample(self):
 
         recordings["duration"].fillna(0, inplace=True)
 
-        self.segments = recordings[["recording_filename", "duration"]].copy()
-        self.segments["segment_onset"] = self.segments.apply(
-            lambda row: np.arange(
-                self.offset,
-                row["duration"] - self.length + 1e-4,
+        recordings = recordings.copy()
+        recordings['start_ts'] = recordings.apply(
+            lambda row: int(pd.Timestamp(str(row['date_iso']) + 'T' + str(row['start_time'])).timestamp()) * 1000,
+            axis=1)
+        recordings['end_ts'] = recordings['start_ts'] + recordings['duration']
+        segments = []
+        # work by groups (by argument), create a singular timeline from those groups and choose periodic segments from there
+        # this means that recordings following each other will maintain continuity in sampling period
+        # also means concurrent recordings in the same session will have the same samples kept time/date wise regardless of shifts in start
+        for i, gdf in recordings.groupby(self.by):
+            all_segments = pd.DataFrame({'segment_onset': np.arange(
+                gdf['start_ts'].min() + self.offset,
+                gdf['end_ts'].max(),
                 self.period + self.length,
-            ),
-            axis=1,
-        )
-        self.segments = self.segments.explode("segment_onset")
-        # discard recordings that can't include segments (they are NA here bc explode keeps empty lists)
-        self.segments = self.segments.dropna(subset=['segment_onset'])
-        self.segments["segment_onset"] = self.segments["segment_onset"].astype(int)
-        self.segments["segment_offset"] = self.segments["segment_onset"] + self.length
-        self.segments.rename(
-            columns={"recording_filename": "recording_filename"}, inplace=True
-        )
+            )})
+            all_segments['segment_offset'] = all_segments['segment_onset'] + self.length
+            rec_segments = []
+            for recording in gdf.to_dict(orient='records'):
+                tmp_segs = all_segments[(all_segments['segment_offset'] > recording['start_ts']) & (all_segments['segment_onset'] < recording['end_ts'])].copy()
+                # cut down overflowing stamps
+                tmp_segs['segment_onset'] = tmp_segs['segment_onset'].apply(lambda x: max(x, recording['start_ts']))
+                tmp_segs['segment_offset'] = tmp_segs['segment_offset'].apply(lambda x: min(x, recording['end_ts']))
+                tmp_segs['segment_onset'] = tmp_segs['segment_onset'] - recording['start_ts']
+                tmp_segs['segment_offset'] = tmp_segs['segment_offset'] - recording['start_ts']
+                tmp_segs['recording_filename'] = recording['recording_filename']
+                rec_segments.append(tmp_segs)
+            segments.append(pd.concat(rec_segments))
+        self.segments = pd.concat(segments)
 
         return self.segments
 
@@ -297,6 +310,12 @@ def add_parser(subparsers, subcommand):
             default="",
             type=str,
         )
+        parser.add_argument(
+            "--by",
+            help="units to sample from (default behavior is to sample by recording)",
+            choices=["recording_filename", "session_id", "child_id"],
+            default="recording_filename",
+        )
 
 
 class RandomVocalizationSampler(Sampler):
diff --git a/docs/source/samplers.rst b/docs/source/samplers.rst
@@ -35,7 +35,17 @@ All samplers have a few parameters in common:
 Periodic sampler
 ~~~~~~~~~~~~~~~~
 
-Draw segments from the recordings, periodically
+Draw segments from the recordings periodically.
+
+The ``--period`` argument is the time between the end of one segment and the start of the next. For example,
+length:60000 (1min) period:3540000 (59min) will sample the first minute of every hour, whereas length:60000 (1min)
+period:3600000 (1h) will sample the 1st min of the 1st hour, the 2nd min of the 2nd hour and so on.
+
+The ``--by`` argument groups recordings to form a single timeline in which the periodicity defines the parts
+to annotate; those parts are then extracted from the recordings of the group. This means that recordings following
+each other will maintain continuity in sampling period if in the same session and sampling by session_id. It also means
+concurrent recordings in the same session will have the same samples kept time/date wise regardless of shifts in start.
+The default is to sample by 'recording_filename', which will simply periodically sample each recording independently.
 
 .. clidoc::
 
diff --git a/examples/valid_raw_data/metadata/recordings.csv b/examples/valid_raw_data/metadata/recordings.csv
@@ -1,4 +1,4 @@
-child_id,recording_device_type,experiment,notes,recording_filename,start_time,noisy_setting,date_iso,duration,discard
-1,usb,test,VERY confidential notes,sound.wav,9:00,1,2020-04-20,4000,
-1,usb,test,,sound2.wav,9:00,1,2020-04-21,4000,0
-1,usb,test,discarded audio,sound3.wav,10:00,0,2020-04-22,10000,1
+child_id,recording_device_type,experiment,notes,recording_filename,start_time,noisy_setting,date_iso,duration,discard,session_id
+1,usb,test,VERY confidential notes,sound.wav,9:00,1,2020-04-20,4000,,s1
+1,usb,test,,sound2.wav,9:00,1,2020-04-21,4000,0,s1
+1,usb,test,discarded audio,sound3.wav,10:00,0,2020-04-22,10000,1,s1
diff --git a/tests/test_samplers.py b/tests/test_samplers.py
@@ -3,6 +3,7 @@
 import pytest
 import shutil
 from functools import partial
+from pathlib import Path
 
 from ChildProject.projects import ChildProject
 from ChildProject.annotations import AnnotationManager
@@ -15,32 +16,39 @@
     SamplerPipeline,
 )
 
+TRUTH = Path('tests', 'truth')
+PATH = Path('output', 'samplers')
 
 def fake_conversation(data, filename):
     return data
 
 
 @pytest.fixture(scope="function")
 def project(request):
-    if not os.path.exists("output/samplers"):
-        shutil.copytree(src="examples/valid_raw_data", dst="output/samplers")
+    if os.path.exists(PATH):
+        # shutil.copytree(src="examples/valid_raw_data", dst="output/annotations")
+        shutil.rmtree(PATH)
+    shutil.copytree(src="examples/valid_raw_data", dst=PATH)
 
-    project = ChildProject("output/samplers")
+    project = ChildProject(PATH)
     project.read()
-    yield project
 
+    yield project
 
-def test_periodic(project):
+@pytest.mark.parametrize("by,truth",
+                         [('recording_filename', TRUTH / 'sampler' / 'periodic_rec.csv'),
+                          ('session_id', TRUTH / 'sampler' / 'periodic_sess.csv'),
+                          ])
+def test_periodic(project, by, truth):
     sampler = PeriodicSampler(
-        project=project, length=1000, period=1000, recordings=["sound.wav"]
+        project=project, offset=1000, length=500, period=200, recordings=["sound.wav",'sound2.wav'], by=by
     )
     sampler.sample()
 
-    duration = project.recordings[
-        project.recordings["recording_filename"] == "sound.wav"
-    ]["duration"].iloc[0]
+    # sampler.segments.to_csv(truth, index=False)
+    truth = pd.read_csv(truth)
 
-    assert len(sampler.segments) == int(duration / (1000 + 1000))
+    pd.testing.assert_frame_equal(sampler.segments.reset_index(drop=True), truth, check_like=True)
 
 
 def test_energy_detection(project):
diff --git a/tests/truth/sampler/periodic_rec.csv b/tests/truth/sampler/periodic_rec.csv
@@ -0,0 +1,11 @@
+segment_onset,segment_offset,recording_filename
+1000,1500,sound.wav
+1700,2200,sound.wav
+2400,2900,sound.wav
+3100,3600,sound.wav
+3800,4000,sound.wav
+1000,1500,sound2.wav
+1700,2200,sound2.wav
+2400,2900,sound2.wav
+3100,3600,sound2.wav
+3800,4000,sound2.wav
diff --git a/tests/truth/sampler/periodic_sess.csv b/tests/truth/sampler/periodic_sess.csv
@@ -0,0 +1,12 @@
+segment_onset,segment_offset,recording_filename
+1000,1500,sound.wav
+1700,2200,sound.wav
+2400,2900,sound.wav
+3100,3600,sound.wav
+3800,4000,sound.wav
+0,400,sound2.wav
+600,1100,sound2.wav
+1300,1800,sound2.wav
+2000,2500,sound2.wav
+2700,3200,sound2.wav
+3400,3900,sound2.wav