Allow lhotse installation without torchaudio for a limited set of features (#1231)

* Allow lhotse installation without torchaudio with a limited set of features
* Add a CI runner with torchaudio absent
* Add a CI runner with torchaudio absent
* Skip some tests when torchaudio is not present
* Add most basic tests only for missing torchaudio case
* Rollback changes to unit tests so far
* Restore previous CI setup but add a new test runner for non-torchaudio tests
* Fix the name of the missing torchaudio CI test
* Enable in-memory audio write/read with libsndfile when torchaudio is not available
* Remove a flaky redundant test case

---------

Co-authored-by: Piotr Żelasko <[email protected]>
lhotse-speech · Dec 8, 2023 · 78b3a12 · 78b3a12
1 parent 6c777da
commit 78b3a12
Show file tree

Hide file tree

Showing 18 changed files with 220 additions and 44 deletions.
diff --git a/.github/workflows/missing_torchaudio.yml b/.github/workflows/missing_torchaudio.yml
@@ -0,0 +1,48 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: missing_torchaudio
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  missing_torchaudio:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - python-version: "3.11"
+            torch-install-cmd: "pip install torch==2.0 --extra-index-url https://download.pytorch.org/whl/cpu"
+
+      fail-fast: false
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: pip
+        cache-dependency-path: |
+          docs/requirements.txt
+          setup.py
+    - name: Install apt dependencies
+      run: |
+        sudo apt update
+        sudo apt install libsndfile1-dev libsndfile1 ffmpeg sox
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install wheel numpy
+        # Force the installation of a CPU-only PyTorch
+        ${{ matrix.torch-install-cmd }}
+        # LHOTSE_REQUIRE_TORCHAUDIO=0 has no effect when torchaudio is installed; when it is not, it lets the install proceed without requiring its presence
+        LHOTSE_REQUIRE_TORCHAUDIO=0 pip install '.[tests]'
+    - name: Run basic tests only for missing torchaudio case
+      run: |
+        pytest test/test_missing_torchaudio.py
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -23,9 +23,9 @@ jobs:
           - python-version: "3.9"
             torch-install-cmd: "pip install torch==1.8.2+cpu torchaudio==0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html"
           - python-version: "3.10"
-            torch-install-cmd: "pip install torch==1.12.1 torchaudio==0.12.1 torchdata==0.4.1 --extra-index-url https://download.pytorch.org/whl/cpu"
+            torch-install-cmd: "pip install torch==1.12.1 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cpu"
           - python-version: "3.11"
-            torch-install-cmd: "pip install torch==2.0 torchaudio==2.0 torchdata==0.6 --extra-index-url https://download.pytorch.org/whl/cpu"
+            torch-install-cmd: "pip install torch==2.0 torchaudio==2.0 --extra-index-url https://download.pytorch.org/whl/cpu"
 
       fail-fast: false
 
@@ -49,6 +49,7 @@ jobs:
         pip install wheel numpy
         # Force the installation of a CPU-only PyTorch
         ${{ matrix.torch-install-cmd }}
+        # the torchaudio env var does nothing when torchaudio is installed, but doesn't require its presence when it's not
         pip install '.[tests]'
         # Enable some optional tests
         pip install h5py dill smart_open[http] kaldifeat kaldi_native_io webdataset==0.2.5 s3prl scipy nara_wpe pyloudnorm

diff --git a/lhotse/audio/backend.py b/lhotse/audio/backend.py
@@ -19,9 +19,9 @@
     verbose_audio_loading_exceptions,
 )
 from lhotse.augmentation import Resample
-from lhotse.utils import Pathlike, Seconds, compute_num_samples
+from lhotse.utils import Pathlike, Seconds, compute_num_samples, is_torchaudio_available
 
-_FFMPEG_TORCHAUDIO_INFO_ENABLED: bool = True
+_FFMPEG_TORCHAUDIO_INFO_ENABLED: bool = is_torchaudio_available()
 CURRENT_AUDIO_BACKEND: Optional["AudioBackend"] = None
 
 
@@ -276,12 +276,20 @@ def read_audio(
         )
 
     def handles_special_case(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
-        return torchaudio_supports_ffmpeg() and isinstance(path_or_fd, BytesIO)
+        return (
+            is_torchaudio_available()
+            and torchaudio_supports_ffmpeg()
+            and isinstance(path_or_fd, BytesIO)
+        )
 
     def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
         # Technically it's applicable with regular files as well, but for now
         # we're not enabling that feature.
-        return torchaudio_supports_ffmpeg() and isinstance(path_or_fd, BytesIO)
+        return (
+            is_torchaudio_available()
+            and torchaudio_supports_ffmpeg()
+            and isinstance(path_or_fd, BytesIO)
+        )
 
 
 class TorchaudioDefaultBackend(AudioBackend):
@@ -298,6 +306,9 @@ def read_audio(
             duration=duration,
         )
 
+    def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
+        return is_torchaudio_available()
+
 
 class TorchaudioFFMPEGBackend(AudioBackend):
     """
@@ -324,7 +335,7 @@ def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
         For version == 2.0.x, we also need env var TORCHAUDIO_USE_BACKEND_DISPATCHER=1
         For version >= 2.1.x, this will already be the default.
         """
-        return torchaudio_2_0_ffmpeg_enabled()
+        return is_torchaudio_available() and torchaudio_2_0_ffmpeg_enabled()
 
 
 class LibsndfileBackend(AudioBackend):
@@ -357,9 +368,7 @@ def handles_special_case(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
         )
 
     def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
-        # Technically it's applicable with regular files as well, but for now
-        # we're not enabling that feature.
-        return not (sys.platform == "darwin") and isinstance(path_or_fd, BytesIO)
+        return True
 
 
 class AudioreadBackend(AudioBackend):
@@ -481,6 +490,9 @@ def torchaudio_2_0_ffmpeg_enabled() -> bool:
     Returns ``True`` when torchaudio.load supports "ffmpeg" backend.
     This requires either version 2.1.x+ or 2.0.x with env var TORCHAUDIO_USE_BACKEND_DISPATCHER=1.
     """
+    if not is_torchaudio_available():
+        return False
+
     import torchaudio
     from packaging import version
 
@@ -1149,6 +1161,27 @@ def read_sph(
     return audio, sampling_rate
 
 
+def save_flac_file(
+    dest: Union[str, Path, BytesIO],
+    src: Union[torch.Tensor, np.ndarray],
+    sample_rate: int,
+    *args,
+    **kwargs,
+):
+    if is_torchaudio_available():
+        torchaudio_save_flac_safe(
+            dest=dest, src=src, sample_rate=sample_rate, *args, **kwargs
+        )
+    else:
+        import soundfile as sf
+
+        kwargs.pop("bits_per_sample", None)  # ignore this arg when not using torchaudio
+        if torch.is_tensor(src):
+            src = src.numpy()
+        src = src.squeeze(0)
+        sf.write(file=dest, data=src, samplerate=sample_rate, format="FLAC")
+
+
 def torchaudio_save_flac_safe(
     dest: Union[str, Path, BytesIO],
     src: Union[torch.Tensor, np.ndarray],

diff --git a/lhotse/audio/recording.py b/lhotse/audio/recording.py
@@ -8,7 +8,7 @@
 import torch
 from _decimal import ROUND_HALF_UP
 
-from lhotse.audio.backend import info, torchaudio_info, torchaudio_save_flac_safe
+from lhotse.audio.backend import info, save_flac_file, torchaudio_info
 from lhotse.audio.source import AudioSource
 from lhotse.audio.utils import (
     DurationMismatchError,
@@ -298,7 +298,7 @@ def move_to_memory(
             channels=channels, offset=ifnone(offset, 0), duration=duration
         )
         stream = BytesIO()
-        torchaudio_save_flac_safe(
+        save_flac_file(
             stream, torch.from_numpy(audio), self.sampling_rate, format=format
         )
         channels = (ifnone(channels, self.channel_ids),)

diff --git a/lhotse/augmentation/torchaudio.py b/lhotse/augmentation/torchaudio.py
@@ -11,6 +11,7 @@
     Seconds,
     compute_num_samples,
     during_docs_build,
+    is_torchaudio_available,
     perturb_num_samples,
 )
 
@@ -58,6 +59,7 @@ def __init__(self, effects: EffectsList):
         self.effects = effects
 
     def __call__(self, tensor: Union[torch.Tensor, np.ndarray], sampling_rate: int):
+        check_for_torchaudio()
         check_torchaudio_version()
         import torchaudio
 
@@ -113,6 +115,7 @@ class Speed(AudioTransform):
     factor: float
 
     def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
+        check_for_torchaudio()
         resampler = get_or_create_resampler(
             round(sampling_rate * self.factor), sampling_rate
         )
@@ -152,6 +155,7 @@ def reverse_timestamps(
 def get_or_create_resampler(
     source_sampling_rate: int, target_sampling_rate: int
 ) -> torch.nn.Module:
+    check_for_torchaudio()
     global _precompiled_resamplers
 
     tpl = (source_sampling_rate, target_sampling_rate)
@@ -182,6 +186,7 @@ def __post_init__(self):
         )
 
     def __call__(self, samples: np.ndarray, *args, **kwargs) -> np.ndarray:
+        check_for_torchaudio()
         if self.source_sampling_rate == self.target_sampling_rate:
             return samples
 
@@ -234,6 +239,7 @@ class Tempo(AudioTransform):
     factor: float
 
     def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
+        check_for_torchaudio()
         check_torchaudio_version()
         import torchaudio
 
@@ -288,6 +294,7 @@ class Volume(AudioTransform):
     factor: float
 
     def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
+        check_for_torchaudio()
         check_torchaudio_version()
         import torchaudio
 
@@ -356,3 +363,11 @@ def check_torchaudio_version():
             "please upgrade your PyTorch to 1.7.1 and torchaudio to 0.7.2 (or higher) "
             "to use them."
         )
+
+
+def check_for_torchaudio():
+    if not is_torchaudio_available():
+        raise RuntimeError(
+            "This transform is not supported in torchaudio-free Lhotse installation. "
+            "Please install torchaudio and try again."
+        )
diff --git a/lhotse/augmentation/utils.py b/lhotse/augmentation/utils.py
@@ -105,9 +105,14 @@ def __call__(self, nsource: int = 1) -> np.ndarray:
         :param nsource: number of sources (RIR filters) to simulate. Default: 1.
         :return: simulated RIR filter for all sources, shape: (nsource, nsample)
         """
-        from torchaudio.functional import highpass_biquad
+        from lhotse.augmentation.torchaudio import (
+            check_for_torchaudio,
+            get_or_create_resampler,
+        )
 
-        from lhotse.augmentation.torchaudio import get_or_create_resampler
+        check_for_torchaudio()
+
+        from torchaudio.functional import highpass_biquad
 
         # the sample rate at which the original RIR filter is generated
         ratio = 64

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
@@ -23,6 +23,7 @@
     deprecated,
     fastcopy,
     ifnone,
+    is_torchaudio_available,
     overlaps,
     to_hashable,
 )
@@ -841,7 +842,6 @@ def save_audio(
             to mono before saving.
         :return: a new Cut instance.
         """
-        import torchaudio
 
         storage_path = Path(storage_path)
         samples = self.load_audio(**kwargs)
@@ -851,13 +851,20 @@ def save_audio(
         if augment_fn is not None:
             samples = augment_fn(samples, self.sampling_rate)
 
-        torchaudio.save(
-            str(storage_path),
-            torch.as_tensor(samples),
-            sample_rate=self.sampling_rate,
-            encoding=encoding,
-            bits_per_sample=bits_per_sample,
-        )
+        if is_torchaudio_available():
+            import torchaudio
+
+            torchaudio.save(
+                str(storage_path),
+                torch.as_tensor(samples),
+                sample_rate=self.sampling_rate,
+                encoding=encoding,
+                bits_per_sample=bits_per_sample,
+            )
+        else:
+            import soundfile as sf
+
+            sf.write(str(storage_path), samples, samplerate=self.sampling_rate)
         recording = Recording(
             id=storage_path.stem,
             sampling_rate=self.sampling_rate,

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
@@ -11,7 +11,7 @@
 from intervaltree import IntervalTree
 
 from lhotse.audio import Recording, VideoInfo, get_audio_duration_mismatch_tolerance
-from lhotse.audio.backend import torchaudio_save_flac_safe
+from lhotse.audio.backend import save_flac_file
 from lhotse.audio.mixer import AudioMixer, VideoMixer, audio_energy
 from lhotse.augmentation import (
     AudioTransform,
@@ -386,7 +386,7 @@ def to_mono(
         """
         samples = self.load_audio(mono_downmix=True)
         stream = BytesIO()
-        torchaudio_save_flac_safe(
+        save_flac_file(
             stream,
             samples,
             self.sampling_rate,

diff --git a/lhotse/features/kaldi/extractors.py b/lhotse/features/kaldi/extractors.py
@@ -40,6 +40,7 @@ class FbankConfig:
     num_filters: int = 80
     num_mel_bins: Optional[int] = None  # do not use
     norm_filters: bool = False
+    torchaudio_compatible_mel_scale: bool = True
     device: str = "cpu"
 
     def __post_init__(self):
@@ -165,6 +166,7 @@ class MfccConfig:
     low_freq: float = 20.0
     high_freq: float = -400.0
     num_filters: int = 23
+    torchaudio_compatible_mel_scale: bool = True
     num_mel_bins: Optional[int] = None  # do not use
     norm_filters: bool = False
     num_ceps: int = 13

diff --git a/lhotse/recipes/voxpopuli.py b/lhotse/recipes/voxpopuli.py
@@ -28,13 +28,10 @@
 import shutil
 import tarfile
 import tempfile
-from ast import literal_eval
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Optional, Tuple, Union
 
-import torch
-import torchaudio
 from torch.hub import download_url_to_file
 from tqdm import tqdm
 

diff --git a/lhotse/shar/writers/audio.py b/lhotse/shar/writers/audio.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 import torch
-import torchaudio
 from typing_extensions import Literal
 
 from lhotse import Recording
@@ -45,6 +44,8 @@ def __init__(
         shard_size: Optional[int] = 1000,
         format: Literal["wav", "flac", "mp3"] = "flac",
     ):
+        import torchaudio
+
         self.format = format
         self.tar_writer = TarWriter(pattern, shard_size)
         self.save_fn = torchaudio.save