Skip to content

Commit

Permalink
Allow lhotse installation without torchaudio for a limited set of fea…
Browse files Browse the repository at this point in the history
…tures (#1231)

* Allow lhotse installation without torchaudio with a limited set of features

* Add a CI runner with torchaudio absent

* Add a CI runner with torchaudio absent

* Skip some tests when torchaudio is not present

* Add most basic tests only for missing torchaudio case

* Rollback changes to unit tests so far

* Restore previous CI setup but add a new test runner for non-torchaudio tests

* Fix the name of the missing torchaudio CI test

* Enable in-memory audio write/read with libsndfile when torchaudio is not available

* Remove a flaky redundant test case

---------

Co-authored-by: Piotr Żelasko <[email protected]>
  • Loading branch information
pzelasko and pzelasko authored Dec 8, 2023
1 parent 6c777da commit 78b3a12
Show file tree
Hide file tree
Showing 18 changed files with 220 additions and 44 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/missing_torchaudio.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# This workflow installs Lhotse without torchaudio (CPU-only PyTorch) and runs the basic tests for the torchaudio-free configuration
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: missing_torchaudio

on:
push:
branches: [ master ]
pull_request:
branches: [ master ]

jobs:
missing_torchaudio:

runs-on: ubuntu-latest
strategy:
matrix:
include:
- python-version: "3.11"
torch-install-cmd: "pip install torch==2.0 --extra-index-url https://download.pytorch.org/whl/cpu"

fail-fast: false

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: pip
cache-dependency-path: |
docs/requirements.txt
setup.py
- name: Install apt dependencies
run: |
sudo apt update
sudo apt install libsndfile1-dev libsndfile1 ffmpeg sox
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install wheel numpy
# Force the installation of a CPU-only PyTorch
${{ matrix.torch-install-cmd }}
# the torchaudio env var does nothing when torchaudio is installed, but doesn't require its presence when it's not
LHOTSE_REQUIRE_TORCHAUDIO=0 pip install '.[tests]'
- name: Run basic tests only for missing torchaudio case
run: |
pytest test/test_missing_torchaudio.py
5 changes: 3 additions & 2 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ jobs:
- python-version: "3.9"
torch-install-cmd: "pip install torch==1.8.2+cpu torchaudio==0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html"
- python-version: "3.10"
torch-install-cmd: "pip install torch==1.12.1 torchaudio==0.12.1 torchdata==0.4.1 --extra-index-url https://download.pytorch.org/whl/cpu"
torch-install-cmd: "pip install torch==1.12.1 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cpu"
- python-version: "3.11"
torch-install-cmd: "pip install torch==2.0 torchaudio==2.0 torchdata==0.6 --extra-index-url https://download.pytorch.org/whl/cpu"
torch-install-cmd: "pip install torch==2.0 torchaudio==2.0 --extra-index-url https://download.pytorch.org/whl/cpu"

fail-fast: false

Expand All @@ -49,6 +49,7 @@ jobs:
pip install wheel numpy
# Force the installation of a CPU-only PyTorch
${{ matrix.torch-install-cmd }}
# the torchaudio env var does nothing when torchaudio is installed, but doesn't require its presence when it's not
pip install '.[tests]'
# Enable some optional tests
pip install h5py dill smart_open[http] kaldifeat kaldi_native_io webdataset==0.2.5 s3prl scipy nara_wpe pyloudnorm
Expand Down
49 changes: 41 additions & 8 deletions lhotse/audio/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
verbose_audio_loading_exceptions,
)
from lhotse.augmentation import Resample
from lhotse.utils import Pathlike, Seconds, compute_num_samples
from lhotse.utils import Pathlike, Seconds, compute_num_samples, is_torchaudio_available

_FFMPEG_TORCHAUDIO_INFO_ENABLED: bool = True
_FFMPEG_TORCHAUDIO_INFO_ENABLED: bool = is_torchaudio_available()
CURRENT_AUDIO_BACKEND: Optional["AudioBackend"] = None


Expand Down Expand Up @@ -276,12 +276,20 @@ def read_audio(
)

def handles_special_case(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
return torchaudio_supports_ffmpeg() and isinstance(path_or_fd, BytesIO)
return (
is_torchaudio_available()
and torchaudio_supports_ffmpeg()
and isinstance(path_or_fd, BytesIO)
)

def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
# Technically it's applicable with regular files as well, but for now
# we're not enabling that feature.
return torchaudio_supports_ffmpeg() and isinstance(path_or_fd, BytesIO)
return (
is_torchaudio_available()
and torchaudio_supports_ffmpeg()
and isinstance(path_or_fd, BytesIO)
)


class TorchaudioDefaultBackend(AudioBackend):
Expand All @@ -298,6 +306,9 @@ def read_audio(
duration=duration,
)

def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
return is_torchaudio_available()


class TorchaudioFFMPEGBackend(AudioBackend):
"""
Expand All @@ -324,7 +335,7 @@ def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
For version == 2.0.x, we also need env var TORCHAUDIO_USE_BACKEND_DISPATCHER=1
For version >= 2.1.x, this will already be the default.
"""
return torchaudio_2_0_ffmpeg_enabled()
return is_torchaudio_available() and torchaudio_2_0_ffmpeg_enabled()


class LibsndfileBackend(AudioBackend):
Expand Down Expand Up @@ -357,9 +368,7 @@ def handles_special_case(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
)

def is_applicable(self, path_or_fd: Union[Pathlike, FileObject]) -> bool:
# Technically it's applicable with regular files as well, but for now
# we're not enabling that feature.
return not (sys.platform == "darwin") and isinstance(path_or_fd, BytesIO)
return True


class AudioreadBackend(AudioBackend):
Expand Down Expand Up @@ -481,6 +490,9 @@ def torchaudio_2_0_ffmpeg_enabled() -> bool:
Returns ``True`` when torchaudio.load supports "ffmpeg" backend.
This requires either version 2.1.x+ or 2.0.x with env var TORCHAUDIO_USE_BACKEND_DISPATCHER=1.
"""
if not is_torchaudio_available():
return False

import torchaudio
from packaging import version

Expand Down Expand Up @@ -1149,6 +1161,27 @@ def read_sph(
return audio, sampling_rate


def save_flac_file(
    dest: Union[str, Path, BytesIO],
    src: Union[torch.Tensor, np.ndarray],
    sample_rate: int,
    *args,
    **kwargs,
):
    """
    Write audio samples to ``dest`` as FLAC.

    When torchaudio is available the call is delegated to
    :func:`torchaudio_save_flac_safe`; otherwise the data is written
    with ``soundfile``.

    :param dest: output file path or an in-memory ``BytesIO`` buffer.
    :param src: audio samples as a tensor or numpy array.
        NOTE(review): assumed to be 1-D ``(num_samples,)`` or 2-D
        ``(num_channels, num_samples)`` -- confirm against callers.
    :param sample_rate: sampling rate of ``src`` in Hz.
    :param args: extra positional arguments, forwarded only to the torchaudio path.
    :param kwargs: extra keyword arguments (e.g. ``bits_per_sample``), forwarded
        only to the torchaudio path. The soundfile fallback ignores them and
        always writes ``format="FLAC"``.
    """
    if is_torchaudio_available():
        # Forward the leading arguments positionally: combining ``dest=...``
        # keywords with ``*args`` would bind the first extra positional to
        # ``dest`` and fail with "got multiple values for argument".
        torchaudio_save_flac_safe(dest, src, sample_rate, *args, **kwargs)
        return

    import soundfile as sf

    if torch.is_tensor(src):
        # detach()/cpu() make the conversion safe for tensors that require
        # grad or live on an accelerator; it is a no-op for plain CPU tensors.
        src = src.detach().cpu().numpy()
    if src.ndim == 2:
        # soundfile expects 1-D mono data or 2-D (frames, channels) data.
        # A single leading channel is dropped; multi-channel input is transposed.
        src = src[0] if src.shape[0] == 1 else src.T
    sf.write(file=dest, data=src, samplerate=sample_rate, format="FLAC")


def torchaudio_save_flac_safe(
dest: Union[str, Path, BytesIO],
src: Union[torch.Tensor, np.ndarray],
Expand Down
4 changes: 2 additions & 2 deletions lhotse/audio/recording.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import torch
from _decimal import ROUND_HALF_UP

from lhotse.audio.backend import info, torchaudio_info, torchaudio_save_flac_safe
from lhotse.audio.backend import info, save_flac_file, torchaudio_info
from lhotse.audio.source import AudioSource
from lhotse.audio.utils import (
DurationMismatchError,
Expand Down Expand Up @@ -298,7 +298,7 @@ def move_to_memory(
channels=channels, offset=ifnone(offset, 0), duration=duration
)
stream = BytesIO()
torchaudio_save_flac_safe(
save_flac_file(
stream, torch.from_numpy(audio), self.sampling_rate, format=format
)
channels = (ifnone(channels, self.channel_ids),)
Expand Down
15 changes: 15 additions & 0 deletions lhotse/augmentation/torchaudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Seconds,
compute_num_samples,
during_docs_build,
is_torchaudio_available,
perturb_num_samples,
)

Expand Down Expand Up @@ -58,6 +59,7 @@ def __init__(self, effects: EffectsList):
self.effects = effects

def __call__(self, tensor: Union[torch.Tensor, np.ndarray], sampling_rate: int):
check_for_torchaudio()
check_torchaudio_version()
import torchaudio

Expand Down Expand Up @@ -113,6 +115,7 @@ class Speed(AudioTransform):
factor: float

def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
check_for_torchaudio()
resampler = get_or_create_resampler(
round(sampling_rate * self.factor), sampling_rate
)
Expand Down Expand Up @@ -152,6 +155,7 @@ def reverse_timestamps(
def get_or_create_resampler(
source_sampling_rate: int, target_sampling_rate: int
) -> torch.nn.Module:
check_for_torchaudio()
global _precompiled_resamplers

tpl = (source_sampling_rate, target_sampling_rate)
Expand Down Expand Up @@ -182,6 +186,7 @@ def __post_init__(self):
)

def __call__(self, samples: np.ndarray, *args, **kwargs) -> np.ndarray:
check_for_torchaudio()
if self.source_sampling_rate == self.target_sampling_rate:
return samples

Expand Down Expand Up @@ -234,6 +239,7 @@ class Tempo(AudioTransform):
factor: float

def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
check_for_torchaudio()
check_torchaudio_version()
import torchaudio

Expand Down Expand Up @@ -288,6 +294,7 @@ class Volume(AudioTransform):
factor: float

def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
check_for_torchaudio()
check_torchaudio_version()
import torchaudio

Expand Down Expand Up @@ -356,3 +363,11 @@ def check_torchaudio_version():
"please upgrade your PyTorch to 1.7.1 and torchaudio to 0.7.2 (or higher) "
"to use them."
)


def check_for_torchaudio():
    """
    Ensure torchaudio can be used before running a torchaudio-based transform.

    :raises RuntimeError: when ``is_torchaudio_available()`` reports that
        torchaudio is not available.
    """
    if is_torchaudio_available():
        return
    raise RuntimeError(
        "This transform is not supported in torchaudio-free Lhotse installation. "
        "Please install torchaudio and try again."
    )
9 changes: 7 additions & 2 deletions lhotse/augmentation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,14 @@ def __call__(self, nsource: int = 1) -> np.ndarray:
:param nsource: number of sources (RIR filters) to simulate. Default: 1.
:return: simulated RIR filter for all sources, shape: (nsource, nsample)
"""
from torchaudio.functional import highpass_biquad
from lhotse.augmentation.torchaudio import (
check_for_torchaudio,
get_or_create_resampler,
)

from lhotse.augmentation.torchaudio import get_or_create_resampler
check_for_torchaudio()

from torchaudio.functional import highpass_biquad

# the sample rate at which the original RIR filter is generated
ratio = 64
Expand Down
23 changes: 15 additions & 8 deletions lhotse/cut/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
deprecated,
fastcopy,
ifnone,
is_torchaudio_available,
overlaps,
to_hashable,
)
Expand Down Expand Up @@ -841,7 +842,6 @@ def save_audio(
to mono before saving.
:return: a new Cut instance.
"""
import torchaudio

storage_path = Path(storage_path)
samples = self.load_audio(**kwargs)
Expand All @@ -851,13 +851,20 @@ def save_audio(
if augment_fn is not None:
samples = augment_fn(samples, self.sampling_rate)

torchaudio.save(
str(storage_path),
torch.as_tensor(samples),
sample_rate=self.sampling_rate,
encoding=encoding,
bits_per_sample=bits_per_sample,
)
if is_torchaudio_available():
import torchaudio

torchaudio.save(
str(storage_path),
torch.as_tensor(samples),
sample_rate=self.sampling_rate,
encoding=encoding,
bits_per_sample=bits_per_sample,
)
else:
import soundfile as sf

sf.write(str(storage_path), samples, samplerate=self.sampling_rate)
recording = Recording(
id=storage_path.stem,
sampling_rate=self.sampling_rate,
Expand Down
4 changes: 2 additions & 2 deletions lhotse/cut/mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from intervaltree import IntervalTree

from lhotse.audio import Recording, VideoInfo, get_audio_duration_mismatch_tolerance
from lhotse.audio.backend import torchaudio_save_flac_safe
from lhotse.audio.backend import save_flac_file
from lhotse.audio.mixer import AudioMixer, VideoMixer, audio_energy
from lhotse.augmentation import (
AudioTransform,
Expand Down Expand Up @@ -386,7 +386,7 @@ def to_mono(
"""
samples = self.load_audio(mono_downmix=True)
stream = BytesIO()
torchaudio_save_flac_safe(
save_flac_file(
stream,
samples,
self.sampling_rate,
Expand Down
2 changes: 2 additions & 0 deletions lhotse/features/kaldi/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class FbankConfig:
num_filters: int = 80
num_mel_bins: Optional[int] = None # do not use
norm_filters: bool = False
torchaudio_compatible_mel_scale: bool = True
device: str = "cpu"

def __post_init__(self):
Expand Down Expand Up @@ -165,6 +166,7 @@ class MfccConfig:
low_freq: float = 20.0
high_freq: float = -400.0
num_filters: int = 23
torchaudio_compatible_mel_scale: bool = True
num_mel_bins: Optional[int] = None # do not use
norm_filters: bool = False
num_ceps: int = 13
Expand Down
5 changes: 1 addition & 4 deletions lhotse/recipes/voxpopuli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,10 @@
import shutil
import tarfile
import tempfile
from ast import literal_eval
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, Optional, Tuple, Union

import torch
import torchaudio
from torch.hub import download_url_to_file
from tqdm import tqdm

Expand Down
3 changes: 2 additions & 1 deletion lhotse/shar/writers/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import numpy as np
import torch
import torchaudio
from typing_extensions import Literal

from lhotse import Recording
Expand Down Expand Up @@ -45,6 +44,8 @@ def __init__(
shard_size: Optional[int] = 1000,
format: Literal["wav", "flac", "mp3"] = "flac",
):
import torchaudio

self.format = format
self.tar_writer = TarWriter(pattern, shard_size)
self.save_fn = torchaudio.save
Expand Down
Loading

0 comments on commit 78b3a12

Please sign in to comment.