Fixes for #1152 #1153 and #1154 (#1156)

* Tutorial materials in main readme page
* Fixes for #1152 #1153 and #1154
* Fix isinstance use in Python 3.7-3.9
lhotse-speech · Sep 18, 2023 · 3dde48d · 3dde48d
1 parent 567ba29
commit 3dde48d
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 15 deletions.
diff --git a/lhotse/audio/backend.py b/lhotse/audio/backend.py
@@ -872,7 +872,7 @@ def read_opus_ffmpeg(
     if duration is not None:
         cmd += f" -t {duration}"
     # Add the input specifier after offset and duration.
-    cmd += f" -i {path}"
+    cmd += f" -i '{path}'"
     # Optionally resample the output.
     if force_opus_sampling_rate is not None:
         cmd += f" -ar {force_opus_sampling_rate}"
@@ -1028,22 +1028,28 @@ def read_audio(
 
 
 def info(
-    path: Pathlike,
+    path: Union[Pathlike, BytesIO],
     force_opus_sampling_rate: Optional[int] = None,
     force_read_audio: bool = False,
 ) -> LibsndfileCompatibleAudioInfo:
+
+    is_path = isinstance(path, (Path, str))
+
     if force_read_audio:
         # This is a reliable fallback for situations when the user knows that audio files do not
         # have duration metadata in their headers.
         # We will use "audioread" backend that spawns an ffmpeg process, reads the audio,
         # and computes the duration.
+        assert (
+            is_path
+        ), f"info(obj, force_read_audio=True) is not supported for object of type: {type(path)}"
         return audioread_info(str(path))
 
-    if path.suffix.lower() == ".opus":
+    if is_path and Path(path).suffix.lower() == ".opus":
         # We handle OPUS as a special case because we might need to force a certain sampling rate.
         return opus_info(path, force_opus_sampling_rate=force_opus_sampling_rate)
 
-    elif path.suffix.lower() == ".sph":
+    if is_path and Path(path).suffix.lower() == ".sph":
         # We handle SPHERE as another special case because some old codecs (i.e. "shorten" codec)
         # can't be handled by neither pysoundfile nor pyaudioread.
         return sph_info(path)

diff --git a/lhotse/bin/modes/manipulation.py b/lhotse/bin/modes/manipulation.py
@@ -142,10 +142,22 @@ def copy_feats_worker(
 @click.option(
     "--pad/--no-pad",
     default=True,
-    help="Whether to pad the split output idx with zeros (e.g. 01, 02, .., 10).",
+    help="Whether to pad the split output idx with zeros (e.g. 00, 01, 02, .., 10).",
+)
+@click.option(
+    "-i",
+    "--start-idx",
+    type=int,
+    default=0,
+    help="Count splits starting from this index.",
 )
 def split(
-    num_splits: int, manifest: Pathlike, output_dir: Pathlike, shuffle: bool, pad: bool
+    num_splits: int,
+    manifest: Pathlike,
+    output_dir: Pathlike,
+    shuffle: bool,
+    pad: bool,
+    start_idx: int,
 ):
     """
     Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.
@@ -161,8 +173,8 @@ def split(
     parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
     output_dir.mkdir(parents=True, exist_ok=True)
     num_digits = len(str(num_splits))
-    for idx, part in enumerate(parts):
-        idx = f"{idx + 1}".zfill(num_digits) if pad else str(idx + 1)
+    for idx, part in enumerate(parts, start=start_idx):
+        idx = f"{idx}".zfill(num_digits) if pad else str(idx)
         part.to_file((output_dir / manifest.stem).with_suffix(f".{idx}{suffix}"))
 
 
@@ -172,7 +184,16 @@ def split(
 )
 @click.argument("output_dir", type=click.Path(allow_dash=True))
 @click.argument("chunk_size", type=int)
-def split_lazy(manifest: Pathlike, output_dir: Pathlike, chunk_size: int):
+@click.option(
+    "-i",
+    "--start-idx",
+    type=int,
+    default=0,
+    help="Count splits starting from this index.",
+)
+def split_lazy(
+    manifest: Pathlike, output_dir: Pathlike, chunk_size: int, start_idx: int
+):
     """
     Load MANIFEST (lazily if in JSONL format) and split it into parts,
     each with CHUNK_SIZE items.
@@ -187,7 +208,10 @@ def split_lazy(manifest: Pathlike, output_dir: Pathlike, chunk_size: int):
     manifest = Path(manifest)
     any_set = load_manifest_lazy_or_eager(manifest)
     any_set.split_lazy(
-        output_dir=output_dir, chunk_size=chunk_size, prefix=manifest.stem
+        output_dir=output_dir,
+        chunk_size=chunk_size,
+        prefix=manifest.stem,
+        start_idx=start_idx,
     )
 
 

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
@@ -984,7 +984,10 @@ def total_duration_(segments: List[TimeSpan]) -> float:
         print(tabulate(speaker_stats, headers="firstrow", tablefmt="fancy_grid"))
 
     def split(
-        self, num_splits: int, shuffle: bool = False, drop_last: bool = False
+        self,
+        num_splits: int,
+        shuffle: bool = False,
+        drop_last: bool = False,
     ) -> List["CutSet"]:
         """
         Split the :class:`~lhotse.CutSet` into ``num_splits`` pieces of equal size.
@@ -1000,7 +1003,10 @@ def split(
         return [
             CutSet.from_cuts(subset)
             for subset in split_sequence(
-                self, num_splits=num_splits, shuffle=shuffle, drop_last=drop_last
+                self,
+                num_splits=num_splits,
+                shuffle=shuffle,
+                drop_last=drop_last,
             )
         ]
 
@@ -1010,6 +1016,7 @@ def split_lazy(
         chunk_size: int,
         prefix: str = "",
         num_digits: int = 8,
+        start_idx: int = 0,
     ) -> List["CutSet"]:
         """
         Splits a manifest (either lazily or eagerly opened) into chunks, each
@@ -1027,6 +1034,7 @@ def split_lazy(
         :param chunk_size: the number of items in each chunk.
         :param prefix: the prefix of each manifest.
         :param num_digits: the width of ``split_idx``, which will be left padded with zeros to achieve it.
+        :param start_idx: The split index to start counting from (default is ``0``).
         :return: a list of lazily opened chunk manifests.
         """
         return split_manifest_lazy(
@@ -1035,6 +1043,7 @@ def split_lazy(
             chunk_size=chunk_size,
             prefix=prefix,
             num_digits=num_digits,
+            start_idx=start_idx,
         )
 
     def subset(

diff --git a/lhotse/utils.py b/lhotse/utils.py
@@ -280,6 +280,7 @@ def split_manifest_lazy(
     chunk_size: int,
     prefix: str = "",
     num_digits: int = 8,
+    start_idx: int = 0,
 ) -> List:
     """
     Splits a manifest (either lazily or eagerly opened) into chunks, each
@@ -297,6 +298,7 @@ def split_manifest_lazy(
     :param chunk_size: the number of items in each chunk.
     :param prefix: the prefix of each manifest.
     :param num_digits: the width of ``split_idx``, which will be left padded with zeros to achieve it.
+    :param start_idx: The split index to start counting from (default is ``0``).
     :return: a list of lazily opened chunk manifests.
     """
     from lhotse.serialization import SequentialJsonlWriter
@@ -308,7 +310,7 @@ def split_manifest_lazy(
         prefix = "split"
 
     items = iter(it)
-    split_idx = 0
+    split_idx = start_idx
     splits = []
     while True:
         try:

diff --git a/test/audio/test_audio_reads.py b/test/audio/test_audio_reads.py
@@ -1,4 +1,7 @@
-from tempfile import NamedTemporaryFile
+import shutil
+from io import BytesIO
+from pathlib import Path
+from tempfile import NamedTemporaryFile, TemporaryDirectory
 
 import numpy as np
 import pytest
@@ -7,7 +10,12 @@
 
 import lhotse
 from lhotse import AudioSource, Recording
-from lhotse.audio.backend import read_opus_ffmpeg, read_opus_torchaudio, torchaudio_load
+from lhotse.audio.backend import (
+    info,
+    read_opus_ffmpeg,
+    read_opus_torchaudio,
+    torchaudio_load,
+)
 
 
 @pytest.mark.parametrize(
@@ -78,6 +86,14 @@ def test_resample_opus():
     r1.load_audio()
 
 
+def test_opus_name_with_whitespaces():
+    with TemporaryDirectory() as d:
+        path_with_ws = Path(d) / "white space.opus"
+        shutil.copy("test/fixtures/mono_c0.opus", path_with_ws)
+        r = Recording.from_file(path_with_ws)
+        r.load_audio()  # does not raise
+
+
 @pytest.mark.parametrize(
     "path",
     [
@@ -223,3 +239,17 @@ def test_audio_loading_optimization_returns_expected_num_samples():
     cut.duration = reduced_num_samples / cut.sampling_rate
     audio = cut.load_audio()
     assert audio.shape[1] == reduced_num_samples
+
+
+def test_audio_info_from_bytes_io():
+    audio_filelike = BytesIO(open("test/fixtures/mono_c0.wav", "rb").read())
+
+    meta = info(audio_filelike)
+    assert meta.duration == 0.5
+    assert meta.frames == 4000
+    assert meta.samplerate == 8000
+    assert meta.channels == 1
+
+    with pytest.raises(AssertionError):
+        # force_read_audio won't work with a filelike object
+        assert info(audio_filelike, force_read_audio=True)