Add fix_manifests in all recipes (#1128)

* add transform attribute for MixedCut
* add mix_first option in normalize_loudness
* handle the case when mix is called on MixedCut with existing transforms
* add test for mixing with transformed MixedCut
* enhancements and bug fixes
* small changes in some cutset methods
* small fix in error message
* return word alignments from ami recipe
* add word alignments for ICSI
* remove unwanted whitespace
* fix IHM preparation
* remove words with zero or negative duration
* ensure word alignments respect segment boundary
* add save-to-wav option for icsi
* add test for mixing cut with recording
* style fix
* add data prep for voxpopuli
* add fix_manifests for all recipes
lhotse-speech · Aug 24, 2023 · 6914818 · 6914818
1 parent c6fa990
commit 6914818
Show file tree

Hide file tree

Showing 56 changed files with 273 additions and 140 deletions.
diff --git a/lhotse/recipes/adept.py b/lhotse/recipes/adept.py
@@ -34,6 +34,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download
 
 ADEPT_URL = "https://zenodo.org/record/5117102/files/ADEPT.zip"
@@ -140,6 +141,7 @@ def prepare_adept(
         )
 
     supervisions = SupervisionSet.from_segments(supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:

diff --git a/lhotse/recipes/aidatatang_200zh.py b/lhotse/recipes/aidatatang_200zh.py
@@ -20,6 +20,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
@@ -135,6 +136,7 @@ def prepare_aidatatang_200zh(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

diff --git a/lhotse/recipes/aishell.py b/lhotse/recipes/aishell.py
@@ -16,6 +16,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
@@ -140,6 +141,7 @@ def prepare_aishell(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

diff --git a/lhotse/recipes/aishell2.py b/lhotse/recipes/aishell2.py
@@ -11,6 +11,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
 
@@ -73,7 +74,7 @@ def text_normalize(line: str) -> str:
     IC0975W0451 明年二月底小成
     ID0114W0368 我感觉就是在不断拉抽屉
     ID0115W0198 我公司员工不存在持有和泰创投股份的情况
-    
+
     """
     new_line = []
     line = list(line)
@@ -161,6 +162,7 @@ def prepare_aishell2(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

diff --git a/lhotse/recipes/aishell3.py b/lhotse/recipes/aishell3.py
@@ -25,6 +25,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
@@ -159,7 +160,7 @@ def prepare_aishell3(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
-
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

diff --git a/lhotse/recipes/aishell4.py b/lhotse/recipes/aishell4.py
@@ -35,6 +35,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
 
@@ -174,6 +175,7 @@ def prepare_aishell4(
 
         recording_set = RecordingSet.from_recordings(recordings)
         supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

diff --git a/lhotse/recipes/ali_meeting.py b/lhotse/recipes/ali_meeting.py
@@ -25,8 +25,9 @@
 
 from tqdm import tqdm
 
-from lhotse import fix_manifests, validate_recordings_and_supervisions
+from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import normalize_text_alimeeting
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
@@ -204,11 +205,11 @@ def prepare_ali_meeting(
                         )
                         supervisions.append(segment)
 
+        # Fix manifests
         recording_set, supervision_set = fix_manifests(
             RecordingSet.from_recordings(recordings),
             SupervisionSet.from_segments(supervisions),
         )
-        # Fix manifests
         validate_recordings_and_supervisions(recording_set, supervision_set)
 
         if output_dir is not None:

diff --git a/lhotse/recipes/aspire.py b/lhotse/recipes/aspire.py
@@ -34,8 +34,9 @@
 from pathlib import Path
 from typing import Dict, NamedTuple, Optional, Union
 
-from lhotse import fix_manifests, validate_recordings_and_supervisions
+from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import AudioSource, Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, Seconds
 

diff --git a/lhotse/recipes/atcosim.py b/lhotse/recipes/atcosim.py
@@ -18,6 +18,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import (
     Pathlike,
@@ -245,4 +246,9 @@ def prepare_atcosim(
 
     recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
     supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
+
+    logging.warning(
+        "Manifests are lazily materialized. You may want to call `lhotse.qa.fix_manifests()`"
+        " to ensure that all supervisions fall within the corresponding recordings."
+    )
     return recordings, supervisions
diff --git a/lhotse/recipes/audio_mnist.py b/lhotse/recipes/audio_mnist.py
@@ -27,6 +27,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.serialization import load_json
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, resumable_download
@@ -132,6 +133,7 @@ def prepare_audio_mnist(
         )
 
     supervisions = SupervisionSet.from_segments(supervisions)
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:

diff --git a/lhotse/recipes/bengaliai_speech.py b/lhotse/recipes/bengaliai_speech.py
@@ -30,6 +30,7 @@
     set_ffmpeg_torchaudio_info_enabled,
 )
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
 from lhotse.recipes.utils import manifests_exist
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -189,6 +190,10 @@ def prepare_bengaliai_speech(
             num_jobs=num_jobs,
         )
 
+        # Fix manifests
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
         if output_dir is not None:
             supervision_set.to_file(
                 output_dir / f"bengaliai_speech_supervisions_{part}.jsonl.gz"

diff --git a/lhotse/recipes/broadcast_news.py b/lhotse/recipes/broadcast_news.py
@@ -19,6 +19,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, check_and_rglob, recursion_limit
 
@@ -65,6 +66,7 @@ def prepare_broadcast_news(
         chain.from_iterable(sups["segments"] for sups in supervisions_list)
     )
 
+    recordings, segment_supervisions = fix_manifests(recordings, segment_supervisions)
     validate_recordings_and_supervisions(recordings, segment_supervisions)
 
     if output_dir is not None:

diff --git a/lhotse/recipes/bvcc.py b/lhotse/recipes/bvcc.py
@@ -9,6 +9,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike
 
 
@@ -76,6 +77,11 @@ def prepare_bvcc(
         )
     )
     main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
+
+    # Fix manifests
+    main1_dev_recs, main1_dev_sup = fix_manifests(main1_dev_recs, main1_dev_sup)
+    validate_recordings_and_supervisions(main1_dev_recs, main1_dev_sup)
+
     manifests["main1_dev"] = {
         "recordings": main1_dev_recs,
         "supervisions": main1_dev_sup,
@@ -90,6 +96,11 @@ def prepare_bvcc(
         )
     )
     main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
+
+    # Fix manifests
+    main1_train_recs, main1_train_sup = fix_manifests(main1_train_recs, main1_train_sup)
+    validate_recordings_and_supervisions(main1_train_recs, main1_train_sup)
+
     manifests["main1_train"] = {
         "recordings": main1_train_recs,
         "supervisions": main1_train_sup,
@@ -134,6 +145,11 @@ def prepare_bvcc(
         )
     )
     ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
+
+    # Fix manifests
+    ood1_dev_recs, ood1_dev_sup = fix_manifests(ood1_dev_recs, ood1_dev_sup)
+    validate_recordings_and_supervisions(ood1_dev_recs, ood1_dev_sup)
+
     manifests["ood1_dev"] = {
         "recordings": ood1_dev_recs,
         "supervisions": ood1_dev_sup,
@@ -148,6 +164,11 @@ def prepare_bvcc(
         )
     )
     ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
+
+    # Fix manifests
+    ood1_train_recs, ood1_train_sup = fix_manifests(ood1_train_recs, ood1_train_sup)
+    validate_recordings_and_supervisions(ood1_train_recs, ood1_train_sup)
+
     manifests["ood1_train"] = {
         "recordings": ood1_train_recs,
         "supervisions": ood1_train_sup,

diff --git a/lhotse/recipes/cmu_arctic.py b/lhotse/recipes/cmu_arctic.py
@@ -35,7 +35,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import remove_missing_recordings_and_supervisions
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
 BASE_URL = "http://festvox.org/cmu_arctic/packed/"
@@ -167,9 +167,7 @@ def prepare_cmu_arctic(
     supervisions = SupervisionSet.from_segments(supervisions)
 
     # There seem to be 20 recordings missing; remove them before validation
-    recordings, supervisions = remove_missing_recordings_and_supervisions(
-        recordings, supervisions
-    )
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:

diff --git a/lhotse/recipes/cmu_indic.py b/lhotse/recipes/cmu_indic.py
@@ -30,7 +30,7 @@
     SupervisionSet,
     validate_recordings_and_supervisions,
 )
-from lhotse.qa import remove_missing_recordings_and_supervisions
+from lhotse.qa import fix_manifests
 from lhotse.utils import Pathlike, resumable_download, safe_extract
 
 BASE_URL = "http://festvox.org/h2r_indic/"
@@ -194,9 +194,7 @@ def prepare_cmu_indic(
     supervisions = SupervisionSet.from_segments(supervisions)
 
     # There seem to be 20 recordings missing; remove them before validation
-    recordings, supervisions = remove_missing_recordings_and_supervisions(
-        recordings, supervisions
-    )
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     if output_dir is not None:

diff --git a/lhotse/recipes/cmu_kids.py b/lhotse/recipes/cmu_kids.py
@@ -3,22 +3,22 @@
 
 Summary of corpus from LDC webpage:
 
-This database is comprised of sentences read aloud by children. It was originally designed 
-in order to create a training set of children's speech for the SPHINX II automatic speech 
+This database is comprised of sentences read aloud by children. It was originally designed
+in order to create a training set of children's speech for the SPHINX II automatic speech
 recognizer for its use in the LISTEN project at Carnegie Mellon University.
 
-The children range in age from six to eleven (see details below) and were in first through 
-third grades (the 11-year-old was in 6th grade) at the time of recording. There were 24 male 
+The children range in age from six to eleven (see details below) and were in first through
+third grades (the 11-year-old was in 6th grade) at the time of recording. There were 24 male
 and 52 female speakers. There are 5,180 utterances in all.
 
 The speakers come from two separate populations:
 
- 1. SIM95: They were recorded in the summer of 1995 and were enrolled in either the Chatham 
-    College Summer Camp or the Mount Lebanon Extended Day Summer Fun program in Pittsburgh. 
+ 1. SIM95: They were recorded in the summer of 1995 and were enrolled in either the Chatham
+    College Summer Camp or the Mount Lebanon Extended Day Summer Fun program in Pittsburgh.
     They were recorded on-site. There are 44 speakers and 3,333 utterances in this set. They are
     "good" reading examples.
- 2. FP: These are examples of errorful reading and dialectic variants. The readers come from 
-    Fort Pitt School in Pittsburgh and were recorded in April 1996. There are 32 speakers and 
+ 2. FP: These are examples of errorful reading and dialectic variants. The readers come from
+    Fort Pitt School in Pittsburgh and were recorded in April 1996. There are 32 speakers and
     1,847 utterances in this set.
 
 The user should be aware that the speakers' dialect partly reflects what is locally called "Pittsburghese."
@@ -36,6 +36,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
 
@@ -129,6 +130,7 @@ def prepare_cmu_kids(
     recordings = RecordingSet.from_recordings(recordings)
     supervisions = SupervisionSet.from_segments(supervisions)
 
+    recordings, supervisions = fix_manifests(recordings, supervisions)
     validate_recordings_and_supervisions(recordings, supervisions)
 
     manifests = {

diff --git a/lhotse/recipes/commonvoice.py b/lhotse/recipes/commonvoice.py
@@ -30,6 +30,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, is_module_available, resumable_download, safe_extract
 
@@ -305,6 +306,12 @@ def prepare_commonvoice(
                 num_jobs=num_jobs,
             )
 
+            # Fix manifests
+            recording_set, supervision_set = fix_manifests(
+                recording_set, supervision_set
+            )
+            validate_recordings_and_supervisions(recording_set, supervision_set)
+
             supervision_set.to_file(
                 output_dir / f"cv-{lang}_supervisions_{part}.jsonl.gz"
             )

diff --git a/lhotse/recipes/csj.py b/lhotse/recipes/csj.py
@@ -116,6 +116,7 @@
 
 from lhotse import validate_recordings_and_supervisions
 from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike
@@ -889,6 +890,10 @@ def prepare_manifests(
 
             recording_set = RecordingSet.from_recordings(recordings)
             supervision_set = SupervisionSet.from_segments(supervisions)
+
+            recording_set, supervision_set = fix_manifests(
+                recording_set, supervision_set
+            )
             validate_recordings_and_supervisions(recording_set, supervision_set)
 
             if manifest_dir: