Merge branch 'master' of https://github.com/lhotse-speech/lhotse

lhotse-speech · Nov 7, 2024 · 86b7e79 · 86b7e79
2 parents 3b720c4 + 54bb42f
commit 86b7e79
Show file tree

Hide file tree

Showing 29 changed files with 2,821 additions and 87 deletions.
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -22,14 +22,14 @@ jobs:
           - python-version: "3.9"
             torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.10"
-            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.10"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.11"
-            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.11"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.12"
-            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.12"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
 
       fail-fast: false

diff --git a/README.md b/README.md
@@ -116,7 +116,8 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 
 ### Optional dependencies
 
-**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package like this: `pip install lhotse[package_name]`. The supported optional packages include:
+**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package:
+- `torchaudio` used to be a core dependency in Lhotse, but is now optional. Refer to the [official PyTorch documentation for installation](https://pytorch.org/get-started/locally/).
 - `pip install lhotse[kaldi]` for a maximal feature set related to Kaldi compatibility. It includes libraries such as `kaldi_native_io` (a more efficient variant of `kaldi_io`) and `kaldifeat` that port some of Kaldi functionality into Python.
 - `pip install lhotse[orjson]` for up to 50% faster reading of JSONL manifests.
 - `pip install lhotse[webdataset]`. We support "compiling" your data into WebDataset tarball format for more effective IO. You can still interact with the data as if it was a regular lazy CutSet. To learn more, check out the following tutorial: [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lhotse-speech/lhotse/blob/master/examples/02-webdataset-integration.ipynb)

diff --git a/docs/conf.py b/docs/conf.py
@@ -78,4 +78,4 @@
     "exclude-members": "__weakref__",
 }
 
-autodoc_mock_imports = ["torchaudio", "SoundFile", "soundfile"]
+autodoc_mock_imports = ["SoundFile", "soundfile"]
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -109,6 +109,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_fisher_english`
   * - Fisher Spanish
     - :func:`lhotse.recipes.prepare_fisher_spanish`
+  * - FLEURS
+    - :func:`lhotse.recipes.prepare_fleurs`
   * - Fluent Speech Commands
     - :func:`lhotse.recipes.slu`
   * - GALE Arabic Broadcast Speech
@@ -173,6 +175,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_reazonspeech`
   * - RIRs and Noises Corpus (OpenSLR 28)
     - :func:`lhotse.recipes.prepare_rir_noise`
+  * - SBCSAE
+    - :func:`lhotse.recipes.prepare_sbcsae`
   * - Spatial-LibriSpeech
     - :func:`lhotse.recipes.prepare_spatial_librispeech`
   * - Speech Commands
@@ -207,6 +211,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_wenetspeech4tts`
   * - YesNo
     - :func:`lhotse.recipes.prepare_yesno`
+  * - Emilia
+    - :func:`lhotse.recipes.prepare_emilia`
   * - Eval2000
     - :func:`lhotse.recipes.prepare_eval2000`
   * - MGB2

diff --git a/docs/getting-started.rst b/docs/getting-started.rst
@@ -143,7 +143,9 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 Optional dependencies
 *********************
 
-**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package like this: ``pip install lhotse[package_name]``. The supported optional packages include:
+**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package:
+
+* ``torchaudio`` used to be a core dependency in Lhotse, but is now optional. Refer to the `official PyTorch documentation for installation`_.
 
 * ``pip install lhotse[kaldi]`` for a maximal feature set related to Kaldi compatibility. It includes libraries such as ``kaldi_native_io`` (a more efficient variant of ``kaldi_io``) and ``kaldifeat`` that port some of Kaldi functionality into Python.
 
@@ -230,3 +232,4 @@ the speech starts roughly at the first second (100 frames):
 .. _Icefall recipes: https://github.com/k2-fsa/icefall
 .. _orjson: https://pypi.org/project/orjson/
 .. _AIStore: https://aiatscale.org
+.. _official Pytorch documentation for installation: https://pytorch.org/get-started/locally/
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,5 +1,5 @@
 numpy>=1.18.1
 sphinx_rtd_theme==2.0.0
-sphinx==7.2.6
+sphinx==7.1.2
 sphinx-click==5.1.0
 sphinx-autodoc-typehints==2.0.0
diff --git a/lhotse/audio/backend.py b/lhotse/audio/backend.py
@@ -808,7 +808,8 @@ def torchaudio_info(
 
     if torchaudio_ffmpeg_backend_available():
         # Torchaudio 2.1 with official "ffmpeg" backend should solve all the special cases below.
-        info = torchaudio.info(path_or_fileobj, backend="ffmpeg")
+        backend = "ffmpeg" if "ffmpeg" in torchaudio.list_audio_backends() else None
+        info = torchaudio.info(path_or_fileobj, backend=backend)
         return LibsndfileCompatibleAudioInfo(
             channels=info.num_channels,
             frames=info.num_frames,

diff --git a/lhotse/audio/recording.py b/lhotse/audio/recording.py
@@ -8,7 +8,7 @@
 import torch
 from _decimal import ROUND_HALF_UP
 
-from lhotse.audio.backend import info, save_audio, torchaudio_info
+from lhotse.audio.backend import get_current_audio_backend, info, save_audio
 from lhotse.audio.source import AudioSource
 from lhotse.audio.utils import (
     AudioLoadingError,
@@ -260,7 +260,7 @@ def from_bytes(
         :return: a new ``Recording`` instance that owns the byte string data.
         """
         stream = BytesIO(data)
-        audio_info = torchaudio_info(stream)
+        audio_info = get_current_audio_backend().info(stream)
         return Recording(
             id=recording_id,
             sampling_rate=audio_info.samplerate,

diff --git a/lhotse/bin/lhotse.py b/lhotse/bin/lhotse.py
@@ -1,22 +1,6 @@
 #!/usr/bin/env python3
 """
-Use this script like:
-
-$ lhotse --help
-$ lhotse make-feats --help
-$ lhotse make-feats --compressed recording_manifest.yml mfcc_dir/
-$ lhotse write-default-feature-config feat-conf.yml
-$ lhotse kaldi import data/train 16000 train_manifests/
-$ lhotse split 3 audio.yml split_manifests/
-$ lhotse combine feature.1.yml feature.2.yml combined_feature.yml
-$ lhotse recipe --help
-$ lhotse recipe librimix-dataprep path/to/librimix.csv output_manifests_dir/
-$ lhotse recipe librimix-obtain target_dir/
-$ lhotse recipe mini-librispeech-dataprep corpus_dir/ output_manifests_dir/
-$ lhotse recipe mini-librispeech-obtain target_dir/
-$ lhotse cut --help
-$ lhotse cut simple supervisions.yml features.yml simple_cuts.yml
-$ lhotse cut stereo-mixed supervisions.yml features.yml mixed_cuts.yml
+For usage examples, see: https://lhotse.readthedocs.io/en/latest/cli.html
 """
 
 # Note: we import all the CLI modes here so they get auto-registered

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -31,9 +31,11 @@
 from .earnings22 import *
 from .ears import *
 from .edacc import *
+from .emilia import *
 from .eval2000 import *
 from .fisher_english import *
 from .fisher_spanish import *
+from .fleurs import *
 from .gale_arabic import *
 from .gale_mandarin import *
 from .gigaspeech import *
@@ -65,8 +67,10 @@
 from .nsc import *
 from .peoples_speech import *
 from .primewords import *
+from .radio import *
 from .reazonspeech import *
 from .rir_noise import *
+from .sbcsae import *
 from .slu import *
 from .spatial_librispeech import *
 from .speechcommands import *

diff --git a/lhotse/bin/modes/recipes/emilia.py b/lhotse/bin/modes/recipes/emilia.py
@@ -0,0 +1,41 @@
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.emilia import prepare_emilia
+from lhotse.utils import Pathlike
+
+# Limit what ``from .emilia import *`` re-exports to the CLI command itself.
+__all__ = ["emilia"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+# NOTE(review): --lang has no default and is not required, so ``lang`` is None
+# when the flag is omitted -- confirm prepare_emilia accepts that.
+@click.option(
+    "-l",
+    "--lang",
+    type=str,
+    help="The language to process. Valid values: zh, en, ja, ko, de, fr",
+)
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+def emilia(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    lang: str,
+    num_jobs: int = 1,
+) -> None:
+    """Prepare the Emilia corpus manifests."""
+    prepare_emilia(
+        corpus_dir=corpus_dir,
+        output_dir=output_dir,
+        lang=lang,
+        num_jobs=num_jobs,
+    )
diff --git a/lhotse/bin/modes/recipes/fleurs.py b/lhotse/bin/modes/recipes/fleurs.py
@@ -0,0 +1,68 @@
+from typing import Optional, Sequence, Union
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.fleurs import download_fleurs, prepare_fleurs
+from lhotse.utils import Pathlike
+
+__all__ = ["fleurs"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+@click.option(
+    "-l",
+    "--lang",
+    multiple=True,  # repeatable flag; click passes the values as a tuple
+    default=["all"],
+    help="Specify which languages to prepare, e.g., "
+    "        lhotse prepare fleurs corpus_dir output_dir -l hi_in -l en_us ",
+)
+def fleurs(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    num_jobs: int,
+    lang: Optional[Union[str, Sequence[str]]],
+) -> None:
+    """Fleurs ASR data preparation."""
+    prepare_fleurs(corpus_dir, output_dir=output_dir, num_jobs=num_jobs, languages=lang)
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "-l",
+    "--lang",
+    multiple=True,  # repeatable flag; click passes the values as a tuple
+    default=["all"],
+    help="Specify which languages to download, e.g., "
+    "        lhotse download fleurs . -l hi_in -l en_us "
+    "        or: lhotse download fleurs .",
+)
+@click.option(
+    "--force-download",
+    type=bool,
+    is_flag=True,  # boolean switch: takes no value on the command line
+    default=False,
+    help="Specify whether to overwrite an existing archive",
+)
+def fleurs(  # rebinds module-level ``fleurs`` (prepare command above)
+    target_dir: Pathlike,
+    lang: Optional[Union[str, Sequence[str]]],
+    force_download: bool = False,
+) -> None:
+    """FLEURS download."""
+    download_fleurs(
+        target_dir,  # forwarded positionally; the rest go by keyword
+        languages=lang,
+        force_download=force_download,
+    )
diff --git a/lhotse/bin/modes/recipes/ksponspeech.py b/lhotse/bin/modes/recipes/ksponspeech.py
@@ -39,6 +39,7 @@ def ksponspeech(
     output_dir: Pathlike,
     dataset_parts: Sequence[str],
     num_jobs: int,
+    normalize_text: str,
 ):
     """KsponSpeech ASR data preparation."""
     if len(dataset_parts) == 1:
@@ -48,4 +49,5 @@ def ksponspeech(
         output_dir=output_dir,
         num_jobs=num_jobs,
         dataset_parts=dataset_parts,
+        normalize_text=normalize_text,
     )
diff --git a/lhotse/bin/modes/recipes/radio.py b/lhotse/bin/modes/recipes/radio.py
@@ -0,0 +1,41 @@
+from typing import List, Optional, Sequence, Tuple, Union
+
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.radio import prepare_radio
+from lhotse.utils import Pathlike
+
+__all__ = ["radio"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(dir_okay=True))
+@click.argument("output_dir", type=click.Path(dir_okay=True))
+@click.option(
+    "-d",
+    "--min-seg-dur",
+    type=float,
+    default=0.5,
+    help="The minimum segment duration",
+)
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=4,
+    help="The number of parallel threads to use for data preparation",
+)
+def radio(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    min_seg_dur: float = 0.5,
+    num_jobs: int = 4,
+) -> None:
+    """Radio data preparation."""
+    prepare_radio(
+        corpus_dir,  # forwarded positionally; the rest go by keyword
+        output_dir=output_dir,
+        num_jobs=num_jobs,
+        min_segment_duration=min_seg_dur,  # recipe kwarg uses the long name
+    )
diff --git a/lhotse/bin/modes/recipes/sbcsae.py b/lhotse/bin/modes/recipes/sbcsae.py
@@ -0,0 +1,58 @@
+from typing import Optional, Sequence
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.sbcsae import download_sbcsae, prepare_sbcsae
+from lhotse.utils import Pathlike
+
+__all__ = ["sbcsae"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "--geolocation",
+    type=bool,
+    is_flag=True,  # boolean switch: takes no value on the command line
+    default=False,
+    help="Include geographic coordinates of speakers' hometowns in the manifests.",
+)
+@click.option(
+    "--omit-realignments",
+    type=bool,
+    is_flag=True,  # boolean switch: takes no value on the command line
+    default=False,
+    help="Only output the original corpus segmentation without boundary improvements.",
+)
+def sbcsae(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    geolocation: bool,
+    omit_realignments: bool,
+) -> None:
+    """SBCSAE data preparation."""
+    prepare_sbcsae(
+        corpus_dir,  # forwarded positionally; the rest go by keyword
+        output_dir=output_dir,
+        geolocation=geolocation,
+        omit_realignments=omit_realignments,
+    )
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "--force-download",
+    type=bool,
+    is_flag=True,  # boolean switch: takes no value on the command line
+    default=False,
+    help="Force download.",
+)
+def sbcsae(  # rebinds module-level ``sbcsae`` (prepare command above)
+    target_dir: Pathlike,
+    force_download: bool,
+) -> None:
+    """SBCSAE download."""
+    download_sbcsae(target_dir, force_download=force_download)